1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
2; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
3; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
5
; udiv_i32: unsigned 32-bit division of kernel argument %x by %y; the quotient
; is stored to %out.
;
; The opt run (default check prefix) expects -amdgpu-codegenprepare to replace
; the udiv with an inline expansion:
;   * a float reciprocal estimate of %y (uitofp + llvm.amdgcn.rcp.f32),
;     multiplied by the constant 0x41EFFFFFC0000000 and converted back to i32;
;   * one refinement of that estimate using (0 - %y) * estimate and the high
;     32 bits of a 64-bit product;
;   * the trial quotient as the high 32 bits of %x * estimate;
;   * two compare-and-select steps that each add 1 to the quotient while the
;     running remainder is still uge %y.
; The two llc runs (GFX6 and GFX9 prefixes) pin the ISA emitted for tahiti and
; gfx900 for the same input.
6define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
7; CHECK-LABEL: @udiv_i32(
8; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
9; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
10; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
11; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
12; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
13; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
14; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
15; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
16; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
17; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
18; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
19; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
20; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
21; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
22; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
23; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
24; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
25; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
26; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
27; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
28; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
29; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
30; CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[TMP19]], 1
31; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]]
32; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]]
33; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]]
34; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
35; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP24]], 1
36; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
37; CHECK-NEXT:    store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4
38; CHECK-NEXT:    ret void
39;
40; GFX6-LABEL: udiv_i32:
41; GFX6:       ; %bb.0:
42; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
43; GFX6-NEXT:    s_mov_b32 s7, 0xf000
44; GFX6-NEXT:    s_mov_b32 s6, -1
45; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
47; GFX6-NEXT:    s_sub_i32 s4, 0, s3
48; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
49; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
50; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
51; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
52; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
53; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
54; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
55; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
56; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
57; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
58; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
59; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
60; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
61; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
62; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
63; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
64; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
65; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
66; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
67; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
68; GFX6-NEXT:    s_endpgm
69;
70; GFX9-LABEL: udiv_i32:
71; GFX9:       ; %bb.0:
72; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
73; GFX9-NEXT:    v_mov_b32_e32 v2, 0
74; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
75; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
76; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
77; GFX9-NEXT:    s_sub_i32 s4, 0, s3
78; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
79; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
80; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
81; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
82; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
83; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
84; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
85; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
86; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
87; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
88; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
89; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
90; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
91; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
92; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
93; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
94; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
95; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
96; GFX9-NEXT:    s_endpgm
97  %r = udiv i32 %x, %y
98  store i32 %r, i32 addrspace(1)* %out
99  ret void
100}
101
; urem_i32: unsigned 32-bit remainder of kernel argument %x by %y; the result
; is stored to %out.
;
; The expected opt output builds the same reciprocal-based trial quotient as
; the unsigned divide case (rcp.f32 estimate, one refinement, high half of
; %x * estimate), forms the trial remainder %x - q * %y, and then applies two
; compare-and-select steps that each subtract %y while the remainder is still
; uge %y. Only the remainder is corrected; no quotient fix-up is emitted.
; In the expected gfx900 code, everything after the float-to-int conversion is
; done on scalar registers following a v_readfirstlane_b32.
102define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
103; CHECK-LABEL: @urem_i32(
104; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
105; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
106; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
107; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
108; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
109; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
110; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
111; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
112; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
113; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
114; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
115; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
116; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
117; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
118; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
119; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
120; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
121; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
122; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
123; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
124; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
125; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
126; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]]
127; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]]
128; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
129; CHECK-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
130; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
131; CHECK-NEXT:    store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4
132; CHECK-NEXT:    ret void
133;
134; GFX6-LABEL: urem_i32:
135; GFX6:       ; %bb.0:
136; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
137; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
138; GFX6-NEXT:    s_mov_b32 s3, 0xf000
139; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
140; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
141; GFX6-NEXT:    s_sub_i32 s2, 0, s5
142; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
143; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
144; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
145; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
146; GFX6-NEXT:    s_mov_b32 s2, -1
147; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
148; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
149; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
150; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s5
151; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
152; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
153; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
154; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
155; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
156; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
157; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
158; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
159; GFX6-NEXT:    s_endpgm
160;
161; GFX9-LABEL: urem_i32:
162; GFX9:       ; %bb.0:
163; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
164; GFX9-NEXT:    v_mov_b32_e32 v1, 0
165; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
166; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
167; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
168; GFX9-NEXT:    s_sub_i32 s4, 0, s3
169; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
170; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
171; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
172; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
173; GFX9-NEXT:    s_mul_i32 s4, s4, s5
174; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
175; GFX9-NEXT:    s_add_i32 s5, s5, s4
176; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
177; GFX9-NEXT:    s_mul_i32 s4, s4, s3
178; GFX9-NEXT:    s_sub_i32 s2, s2, s4
179; GFX9-NEXT:    s_sub_i32 s4, s2, s3
180; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
181; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
182; GFX9-NEXT:    s_sub_i32 s4, s2, s3
183; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
184; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
185; GFX9-NEXT:    v_mov_b32_e32 v0, s2
186; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
187; GFX9-NEXT:    s_endpgm
188  %r = urem i32 %x, %y
189  store i32 %r, i32 addrspace(1)* %out
190  ret void
191}
192
; sdiv_i32: signed 32-bit division of kernel argument %x by %y; the quotient is
; stored to %out.
;
; The expected opt output reduces the signed divide to the unsigned expansion:
;   * sign masks are taken with ashr 31 on each operand, and their xor is kept
;     as the mask for the result;
;   * each operand is turned into its magnitude with add-mask / xor-mask;
;   * the reciprocal-based unsigned quotient is computed on the magnitudes
;     (rcp.f32 estimate, one refinement, two compare-and-select corrections);
;   * the result mask is applied to the quotient with xor followed by sub.
; The llc runs pin the corresponding tahiti and gfx900 ISA.
193define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
194; CHECK-LABEL: @sdiv_i32(
195; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
196; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
197; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
198; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
199; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
200; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
201; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
202; CHECK-NEXT:    [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
203; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
204; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000
205; CHECK-NEXT:    [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
206; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP7]]
207; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]]
208; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
209; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
210; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
211; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
212; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
213; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
214; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]]
215; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP6]] to i64
216; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
217; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
218; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
219; CHECK-NEXT:    [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
220; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
221; CHECK-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]]
222; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]]
223; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]]
224; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
225; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
226; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]]
227; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]]
228; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]]
229; CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP31]], 1
230; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
231; CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
232; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
233; CHECK-NEXT:    store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4
234; CHECK-NEXT:    ret void
235;
236; GFX6-LABEL: sdiv_i32:
237; GFX6:       ; %bb.0:
238; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
239; GFX6-NEXT:    s_mov_b32 s7, 0xf000
240; GFX6-NEXT:    s_mov_b32 s6, -1
241; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
242; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
243; GFX6-NEXT:    s_add_i32 s3, s3, s8
244; GFX6-NEXT:    s_xor_b32 s3, s3, s8
245; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
246; GFX6-NEXT:    s_sub_i32 s4, 0, s3
247; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
248; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
249; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
250; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
251; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
252; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
253; GFX6-NEXT:    s_add_i32 s1, s2, s0
254; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
255; GFX6-NEXT:    s_xor_b32 s1, s1, s0
256; GFX6-NEXT:    s_xor_b32 s2, s0, s8
257; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
258; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
259; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
260; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
261; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
262; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
263; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
264; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
265; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
266; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
267; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
268; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
269; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
270; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
271; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
272; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
273; GFX6-NEXT:    s_endpgm
274;
275; GFX9-LABEL: sdiv_i32:
276; GFX9:       ; %bb.0:
277; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
278; GFX9-NEXT:    v_mov_b32_e32 v2, 0
279; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
280; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
281; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
282; GFX9-NEXT:    s_add_i32 s3, s3, s4
283; GFX9-NEXT:    s_xor_b32 s3, s3, s4
284; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
285; GFX9-NEXT:    s_sub_i32 s5, 0, s3
286; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
287; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
288; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
289; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
290; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
291; GFX9-NEXT:    s_add_i32 s2, s2, s5
292; GFX9-NEXT:    s_xor_b32 s2, s2, s5
293; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
294; GFX9-NEXT:    s_xor_b32 s4, s5, s4
295; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
296; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
297; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
298; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
299; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
300; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
301; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
302; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
303; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
304; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
305; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
306; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
307; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
308; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
309; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
310; GFX9-NEXT:    s_endpgm
311  %r = sdiv i32 %x, %y
312  store i32 %r, i32 addrspace(1)* %out
313  ret void
314}
315
; srem_i32: signed 32-bit remainder of kernel argument %x by %y; the result is
; stored to %out.
;
; The expected opt output takes the magnitude of both operands (ashr 31 mask,
; add, xor), runs the reciprocal-based unsigned remainder on the magnitudes
; (rcp.f32 estimate, one refinement, two conditional subtractions of the
; divisor), and finally applies the sign mask of %x alone to the remainder
; with xor followed by sub. Unlike the signed divide case, the mask of %y is
; not used for the result.
; In the expected gfx900 code, the integer part of the sequence is done on
; scalar registers following a v_readfirstlane_b32.
316define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
317; CHECK-LABEL: @srem_i32(
318; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
319; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
320; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
321; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
322; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
323; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
324; CHECK-NEXT:    [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
325; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
326; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000
327; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
328; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 0, [[TMP6]]
329; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
330; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
331; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
332; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
333; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
334; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
335; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
336; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]]
337; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
338; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
339; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
340; CHECK-NEXT:    [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
341; CHECK-NEXT:    [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
342; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
343; CHECK-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]]
344; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]]
345; CHECK-NEXT:    [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]]
346; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]]
347; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]]
348; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]]
349; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]]
350; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
351; CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
352; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
353; CHECK-NEXT:    store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4
354; CHECK-NEXT:    ret void
355;
356; GFX6-LABEL: srem_i32:
357; GFX6:       ; %bb.0:
358; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
359; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
360; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
361; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
362; GFX6-NEXT:    s_add_i32 s3, s3, s4
363; GFX6-NEXT:    s_xor_b32 s4, s3, s4
364; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
365; GFX6-NEXT:    s_sub_i32 s3, 0, s4
366; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
367; GFX6-NEXT:    s_add_i32 s2, s2, s5
368; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
369; GFX6-NEXT:    s_xor_b32 s6, s2, s5
370; GFX6-NEXT:    s_mov_b32 s2, -1
371; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
372; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
373; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
374; GFX6-NEXT:    s_mov_b32 s3, 0xf000
375; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
376; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
377; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
378; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
379; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
380; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
381; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
382; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
383; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
384; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
385; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
386; GFX6-NEXT:    v_xor_b32_e32 v0, s5, v0
387; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
388; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
389; GFX6-NEXT:    s_endpgm
390;
391; GFX9-LABEL: srem_i32:
392; GFX9:       ; %bb.0:
393; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
394; GFX9-NEXT:    v_mov_b32_e32 v1, 0
395; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
396; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
397; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
398; GFX9-NEXT:    s_add_i32 s3, s3, s4
399; GFX9-NEXT:    s_xor_b32 s3, s3, s4
400; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
401; GFX9-NEXT:    s_sub_i32 s5, 0, s3
402; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
403; GFX9-NEXT:    s_add_i32 s2, s2, s4
404; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
405; GFX9-NEXT:    s_xor_b32 s2, s2, s4
406; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
407; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
408; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
409; GFX9-NEXT:    s_mul_i32 s5, s5, s6
410; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
411; GFX9-NEXT:    s_add_i32 s6, s6, s5
412; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
413; GFX9-NEXT:    s_mul_i32 s5, s5, s3
414; GFX9-NEXT:    s_sub_i32 s2, s2, s5
415; GFX9-NEXT:    s_sub_i32 s5, s2, s3
416; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
417; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
418; GFX9-NEXT:    s_sub_i32 s5, s2, s3
419; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
420; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
421; GFX9-NEXT:    s_xor_b32 s2, s2, s4
422; GFX9-NEXT:    s_sub_i32 s2, s2, s4
423; GFX9-NEXT:    v_mov_b32_e32 v0, s2
424; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
425; GFX9-NEXT:    s_endpgm
426  %r = srem i32 %x, %y
427  store i32 %r, i32 addrspace(1)* %out
428  ret void
429}
430
; udiv_i16: unsigned 16-bit division of kernel argument %x by %y; the quotient
; is stored to %out.
;
; The expected opt output uses a shorter, float-only expansion than the i32
; cases:
;   * both operands are zero-extended to i32 and converted to float;
;   * the trial quotient is trunc(x * rcp(y));
;   * llvm.amdgcn.fmad.ftz computes (-q) * y + x, and the quotient gets +1
;     when the absolute value of that residual is oge the absolute value of y;
;   * the result is masked with 65535 and truncated back to i16.
; The llc runs pin the corresponding tahiti and gfx900 ISA, where both i16
; arguments are unpacked from a single loaded dword.
431define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
432; CHECK-LABEL: @udiv_i16(
433; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
434; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
435; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
436; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
437; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
438; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
439; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
440; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
441; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
442; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
443; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
444; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
445; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
446; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
447; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
448; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
449; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
450; CHECK-NEXT:    store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2
451; CHECK-NEXT:    ret void
452;
453; GFX6-LABEL: udiv_i16:
454; GFX6:       ; %bb.0:
455; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xb
456; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
457; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
458; GFX6-NEXT:    s_lshr_b32 s3, s2, 16
459; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
460; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
461; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s2
462; GFX6-NEXT:    s_mov_b32 s3, 0xf000
463; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
464; GFX6-NEXT:    s_mov_b32 s2, -1
465; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
466; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
467; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
468; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
469; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
470; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
471; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
472; GFX6-NEXT:    s_endpgm
473;
474; GFX9-LABEL: udiv_i16:
475; GFX9:       ; %bb.0:
476; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
477; GFX9-NEXT:    v_mov_b32_e32 v3, 0
478; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
479; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
480; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
481; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
482; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
483; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
484; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
485; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
486; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
487; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
488; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
489; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
490; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
491; GFX9-NEXT:    global_store_short v3, v0, s[0:1]
492; GFX9-NEXT:    s_endpgm
493  %r = udiv i16 %x, %y
494  store i16 %r, i16 addrspace(1)* %out
495  ret void
496}
497
; urem_i16: unsigned 16-bit remainder of kernel argument %x by %y; the result
; is stored to %out.
;
; The expected opt output computes the quotient with the same float-only
; sequence as the unsigned i16 divide (zext to i32, uitofp, trunc(x * rcp(y)),
; fmad.ftz residual, +1 when the residual magnitude is oge the divisor), then
; derives the remainder as x - q * y in i32, masks it with 65535 and truncates
; it to i16.
; The llc runs pin the corresponding tahiti and gfx900 ISA.
498define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
499; CHECK-LABEL: @urem_i16(
500; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
501; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
502; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
503; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
504; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
505; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
506; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
507; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
508; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
509; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
510; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
511; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
512; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
513; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
514; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
515; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
516; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
517; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
518; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
519; CHECK-NEXT:    store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2
520; CHECK-NEXT:    ret void
521;
522; GFX6-LABEL: urem_i16:
523; GFX6:       ; %bb.0:
524; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
525; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
526; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
527; GFX6-NEXT:    s_lshr_b32 s2, s4, 16
528; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
529; GFX6-NEXT:    s_and_b32 s3, s4, 0xffff
530; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
531; GFX6-NEXT:    s_mov_b32 s3, 0xf000
532; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
533; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
534; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
535; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
536; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
537; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
538; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
539; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
540; GFX6-NEXT:    s_mov_b32 s2, -1
541; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
542; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
543; GFX6-NEXT:    s_endpgm
544;
545; GFX9-LABEL: urem_i16:
546; GFX9:       ; %bb.0:
547; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
548; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
549; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
550; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
551; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
552; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
553; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
554; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
555; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
556; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
557; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
558; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
559; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
560; GFX9-NEXT:    v_mov_b32_e32 v1, 0
561; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
562; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
563; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
564; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
565; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
566; GFX9-NEXT:    s_endpgm
567  %r = urem i16 %x, %y
568  store i16 %r, i16 addrspace(1)* %out
569  ret void
570}
571
; sdiv_i16: signed 16-bit division of kernel argument %x by %y; the quotient is
; stored to %out.
;
; The expected opt output is the signed variant of the float-only i16
; expansion:
;   * both operands are sign-extended to i32 and converted with sitofp;
;   * a correction term is prepared as ((x xor y) ashr 30) or 1
;     (NOTE(review): presumably +1 or -1 matching the quotient's sign);
;   * the trial quotient is trunc(x * rcp(y)), converted with fptosi;
;   * llvm.amdgcn.fmad.ftz computes (-q) * y + x, and the correction term is
;     added to the quotient when the residual magnitude is oge the magnitude
;     of y, otherwise 0 is added;
;   * the result is sign-extended from 16 bits (shl 16 / ashr 16) and
;     truncated to i16.
; The llc runs pin the corresponding tahiti and gfx900 ISA.
572define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
573; CHECK-LABEL: @sdiv_i16(
574; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
575; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
576; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
577; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
578; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
579; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
580; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
581; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
582; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
583; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
584; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
585; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
586; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
587; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
588; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
589; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
590; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
591; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
592; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
593; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
594; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
595; CHECK-NEXT:    store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2
596; CHECK-NEXT:    ret void
597;
598; GFX6-LABEL: sdiv_i16:
599; GFX6:       ; %bb.0:
600; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
601; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
602; GFX6-NEXT:    s_mov_b32 s3, 0xf000
603; GFX6-NEXT:    s_mov_b32 s2, -1
604; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
605; GFX6-NEXT:    s_ashr_i32 s5, s4, 16
606; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
607; GFX6-NEXT:    s_sext_i32_i16 s4, s4
608; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
609; GFX6-NEXT:    s_xor_b32 s4, s4, s5
610; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
611; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
612; GFX6-NEXT:    s_or_b32 s4, s4, 1
613; GFX6-NEXT:    v_mov_b32_e32 v3, s4
614; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
615; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
616; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
617; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
618; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
619; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
620; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
621; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
622; GFX6-NEXT:    s_endpgm
623;
624; GFX9-LABEL: sdiv_i16:
625; GFX9:       ; %bb.0:
626; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
627; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
628; GFX9-NEXT:    v_mov_b32_e32 v1, 0
629; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
630; GFX9-NEXT:    s_ashr_i32 s0, s4, 16
631; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
632; GFX9-NEXT:    s_sext_i32_i16 s1, s4
633; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
634; GFX9-NEXT:    s_xor_b32 s0, s1, s0
635; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
636; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
637; GFX9-NEXT:    s_or_b32 s4, s0, 1
638; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
639; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
640; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
641; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
642; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
643; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
644; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
645; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
646; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
647; GFX9-NEXT:    s_endpgm
648  %r = sdiv i16 %x, %y
649  store i16 %r, i16 addrspace(1)* %out
650  ret void
651}
652
; Test case: signed 16-bit remainder of two kernel arguments, stored to %out.
; The IR expectations below show both operands sign-extended to i32, a
; float-reciprocal quotient estimate that is truncated and then corrected by a
; +1/-1 term derived from the sign of x^y, followed by x - q*y and a
; shl/ashr-by-16 sign-extension before the truncating store.
; The GFX6 and GFX9 expectation groups hold the llc output for the tahiti and
; gfx900 RUN lines at the top of the file. All expectation lines are
; autogenerated (see the NOTE on the first line); regenerate, do not hand-edit.
653define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
654; CHECK-LABEL: @srem_i16(
655; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
656; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
657; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
658; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
659; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
660; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
661; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
662; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
663; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
664; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
665; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
666; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
667; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
668; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
669; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
670; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
671; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
672; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
673; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
674; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
675; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
676; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
677; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
678; CHECK-NEXT:    store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2
679; CHECK-NEXT:    ret void
680;
681; GFX6-LABEL: srem_i16:
682; GFX6:       ; %bb.0:
683; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
684; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
685; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
686; GFX6-NEXT:    s_ashr_i32 s2, s4, 16
687; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
688; GFX6-NEXT:    s_sext_i32_i16 s3, s4
689; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
690; GFX6-NEXT:    s_xor_b32 s3, s3, s2
691; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
692; GFX6-NEXT:    s_ashr_i32 s3, s3, 30
693; GFX6-NEXT:    s_or_b32 s3, s3, 1
694; GFX6-NEXT:    v_mov_b32_e32 v3, s3
695; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
696; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
697; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
698; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
699; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
700; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
701; GFX6-NEXT:    s_mov_b32 s3, 0xf000
702; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
703; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
704; GFX6-NEXT:    s_mov_b32 s2, -1
705; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
706; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
707; GFX6-NEXT:    s_endpgm
708;
709; GFX9-LABEL: srem_i16:
710; GFX9:       ; %bb.0:
711; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
712; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
713; GFX9-NEXT:    s_ashr_i32 s5, s4, 16
714; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s5
715; GFX9-NEXT:    s_sext_i32_i16 s2, s4
716; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s2
717; GFX9-NEXT:    s_xor_b32 s2, s2, s5
718; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
719; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
720; GFX9-NEXT:    s_or_b32 s6, s2, 1
721; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
722; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
723; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
724; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
725; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
726; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
727; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
728; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
729; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
730; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
731; GFX9-NEXT:    v_mov_b32_e32 v1, 0
732; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
733; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
734; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
735; GFX9-NEXT:    s_endpgm
; Input IR under test: one srem whose result is stored through %out.
736  %r = srem i16 %x, %y
737  store i16 %r, i16 addrspace(1)* %out
738  ret void
739}
740
; Test case: unsigned 8-bit division of two kernel arguments, stored to %out.
; The IR expectations below show both operands zero-extended to i32 and
; converted to float, a reciprocal-based quotient estimate that is truncated,
; a 0/1 correction selected by comparing |x - q*y| with |y|, and a mask with
; 255 before the truncating store.
; The GFX6 and GFX9 expectation groups hold the llc output for the tahiti and
; gfx900 RUN lines at the top of the file. All expectation lines are
; autogenerated (see the NOTE on the first line); regenerate, do not hand-edit.
741define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
742; CHECK-LABEL: @udiv_i8(
743; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
744; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
745; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
746; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
747; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
748; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
749; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
750; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
751; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
752; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
753; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
754; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
755; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
756; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
757; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
758; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 255
759; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
760; CHECK-NEXT:    store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1
761; CHECK-NEXT:    ret void
762;
763; GFX6-LABEL: udiv_i8:
764; GFX6:       ; %bb.0:
765; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
766; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
767; GFX6-NEXT:    s_mov_b32 s3, 0xf000
768; GFX6-NEXT:    s_mov_b32 s2, -1
769; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
770; GFX6-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
771; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
772; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
773; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
774; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
775; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
776; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
777; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
778; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
779; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
780; GFX6-NEXT:    s_endpgm
781;
782; GFX9-LABEL: udiv_i8:
783; GFX9:       ; %bb.0:
784; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
785; GFX9-NEXT:    v_mov_b32_e32 v2, 0
786; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
787; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
788; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
789; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
790; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, s2
791; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
792; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
793; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v1
794; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v3
795; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
796; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
797; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
798; GFX9-NEXT:    s_endpgm
; Input IR under test: one udiv whose result is stored through %out.
799  %r = udiv i8 %x, %y
800  store i8 %r, i8 addrspace(1)* %out
801  ret void
802}
803
; Test case: unsigned 8-bit remainder of two kernel arguments, stored to %out.
; The IR expectations below show the same float-reciprocal quotient sequence
; as the unsigned 8-bit division test (zext, uitofp, rcp, trunc, 0/1
; correction), after which the quotient is multiplied by y and subtracted from
; x, and the result is masked with 255 before the truncating store.
; The GFX6 and GFX9 expectation groups hold the llc output for the tahiti and
; gfx900 RUN lines at the top of the file. All expectation lines are
; autogenerated (see the NOTE on the first line); regenerate, do not hand-edit.
804define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
805; CHECK-LABEL: @urem_i8(
806; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
807; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
808; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
809; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
810; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
811; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
812; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
813; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
814; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
815; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
816; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
817; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
818; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
819; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
820; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
821; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
822; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
823; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 255
824; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
825; CHECK-NEXT:    store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1
826; CHECK-NEXT:    ret void
827;
828; GFX6-LABEL: urem_i8:
829; GFX6:       ; %bb.0:
830; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
831; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
832; GFX6-NEXT:    s_mov_b32 s3, 0xf000
833; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
834; GFX6-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
835; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
836; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
837; GFX6-NEXT:    s_lshr_b32 s2, s4, 8
838; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
839; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
840; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
841; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
842; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
843; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
844; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
845; GFX6-NEXT:    s_mov_b32 s2, -1
846; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
847; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
848; GFX6-NEXT:    s_endpgm
849;
850; GFX9-LABEL: urem_i8:
851; GFX9:       ; %bb.0:
852; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
853; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
854; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
855; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
856; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, s2
857; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
858; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
859; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
860; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
861; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v1
862; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v2
863; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
864; GFX9-NEXT:    v_mov_b32_e32 v1, 0
865; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
866; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
867; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
868; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
869; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
870; GFX9-NEXT:    s_endpgm
; Input IR under test: one urem whose result is stored through %out.
871  %r = urem i8 %x, %y
872  store i8 %r, i8 addrspace(1)* %out
873  ret void
874}
875
; Test case: signed 8-bit division of two kernel arguments, stored to %out.
; The IR expectations below show both operands sign-extended to i32, a +1/-1
; correction term built from the sign of x^y (ashr by 30, or 1), a
; float-reciprocal quotient estimate that is truncated and converted with
; fptosi, the correction added when |x - q*y| >= |y|, and a shl/ashr-by-24
; sign-extension before the truncating store.
; The GFX6 and GFX9 expectation groups hold the llc output for the tahiti and
; gfx900 RUN lines at the top of the file. All expectation lines are
; autogenerated (see the NOTE on the first line); regenerate, do not hand-edit.
876define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
877; CHECK-LABEL: @sdiv_i8(
878; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
879; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
880; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
881; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
882; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
883; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
884; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
885; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
886; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
887; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
888; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
889; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
890; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
891; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
892; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
893; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
894; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
895; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
896; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 24
897; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
898; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
899; CHECK-NEXT:    store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1
900; CHECK-NEXT:    ret void
901;
902; GFX6-LABEL: sdiv_i8:
903; GFX6:       ; %bb.0:
904; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
905; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
906; GFX6-NEXT:    s_mov_b32 s3, 0xf000
907; GFX6-NEXT:    s_mov_b32 s2, -1
908; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
909; GFX6-NEXT:    s_bfe_i32 s5, s4, 0x80008
910; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
911; GFX6-NEXT:    s_sext_i32_i8 s4, s4
912; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
913; GFX6-NEXT:    s_xor_b32 s4, s4, s5
914; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
915; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
916; GFX6-NEXT:    s_or_b32 s4, s4, 1
917; GFX6-NEXT:    v_mov_b32_e32 v3, s4
918; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
919; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
920; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
921; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
922; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
923; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
924; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
925; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
926; GFX6-NEXT:    s_endpgm
927;
928; GFX9-LABEL: sdiv_i8:
929; GFX9:       ; %bb.0:
930; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
931; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
932; GFX9-NEXT:    v_mov_b32_e32 v1, 0
933; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
934; GFX9-NEXT:    s_bfe_i32 s0, s4, 0x80008
935; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
936; GFX9-NEXT:    s_sext_i32_i8 s1, s4
937; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
938; GFX9-NEXT:    s_xor_b32 s0, s1, s0
939; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
940; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
941; GFX9-NEXT:    s_or_b32 s4, s0, 1
942; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
943; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
944; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
945; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
946; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
947; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
948; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
949; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
950; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
951; GFX9-NEXT:    s_endpgm
; Input IR under test: one sdiv whose result is stored through %out.
952  %r = sdiv i8 %x, %y
953  store i8 %r, i8 addrspace(1)* %out
954  ret void
955}
956
; Test case: signed 8-bit remainder of two kernel arguments, stored to %out.
; The IR expectations below show the same quotient sequence as the signed
; 8-bit division test (sext, +1/-1 correction term from the sign of x^y,
; float-reciprocal estimate, trunc, fptosi, conditional correction), after
; which the quotient is multiplied by y and subtracted from x, and the result
; is sign-extended via shl/ashr by 24 before the truncating store.
; The GFX6 and GFX9 expectation groups hold the llc output for the tahiti and
; gfx900 RUN lines at the top of the file. All expectation lines are
; autogenerated (see the NOTE on the first line); regenerate, do not hand-edit.
957define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
958; CHECK-LABEL: @srem_i8(
959; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
960; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
961; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
962; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
963; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
964; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
965; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
966; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
967; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
968; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
969; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
970; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
971; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
972; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
973; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
974; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
975; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
976; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
977; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
978; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
979; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 24
980; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
981; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
982; CHECK-NEXT:    store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1
983; CHECK-NEXT:    ret void
984;
985; GFX6-LABEL: srem_i8:
986; GFX6:       ; %bb.0:
987; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
988; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
989; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
990; GFX6-NEXT:    s_bfe_i32 s2, s4, 0x80008
991; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
992; GFX6-NEXT:    s_sext_i32_i8 s5, s4
993; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
994; GFX6-NEXT:    s_xor_b32 s2, s5, s2
995; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
996; GFX6-NEXT:    s_ashr_i32 s2, s2, 30
997; GFX6-NEXT:    s_or_b32 s2, s2, 1
998; GFX6-NEXT:    v_mov_b32_e32 v3, s2
999; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
1000; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
1001; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
1002; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
1003; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
1004; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
1005; GFX6-NEXT:    s_lshr_b32 s3, s4, 8
1006; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1007; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s3
1008; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1009; GFX6-NEXT:    s_mov_b32 s2, -1
1010; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1011; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1012; GFX6-NEXT:    s_endpgm
1013;
1014; GFX9-LABEL: srem_i8:
1015; GFX9:       ; %bb.0:
1016; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
1017; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1018; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1019; GFX9-NEXT:    s_bfe_i32 s0, s4, 0x80008
1020; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
1021; GFX9-NEXT:    s_sext_i32_i8 s1, s4
1022; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s1
1023; GFX9-NEXT:    s_xor_b32 s0, s1, s0
1024; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1025; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
1026; GFX9-NEXT:    s_lshr_b32 s5, s4, 8
1027; GFX9-NEXT:    s_or_b32 s6, s0, 1
1028; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
1029; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
1030; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
1031; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
1032; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
1033; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
1034; GFX9-NEXT:    s_cselect_b32 s0, s6, 0
1035; GFX9-NEXT:    v_add_u32_e32 v0, s0, v2
1036; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
1037; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1038; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
1039; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
1040; GFX9-NEXT:    s_endpgm
; Input IR under test: one srem whose result is stored through %out.
1041  %r = srem i8 %x, %y
1042  store i8 %r, i8 addrspace(1)* %out
1043  ret void
1044}
1045
1046define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1047; CHECK-LABEL: @udiv_v4i32(
1048; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1049; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1050; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
1051; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
1052; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
1053; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
1054; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
1055; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
1056; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
1057; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
1058; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
1059; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
1060; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
1061; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
1062; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
1063; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
1064; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1065; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1066; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1067; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1068; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1069; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
1070; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
1071; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
1072; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
1073; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
1074; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
1075; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
1076; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
1077; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
1078; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
1079; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0
1080; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1
1081; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1082; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
1083; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
1084; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
1085; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
1086; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
1087; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
1088; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
1089; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
1090; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
1091; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
1092; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
1093; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
1094; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
1095; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
1096; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
1097; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
1098; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1099; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
1100; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
1101; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
1102; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
1103; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
1104; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
1105; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
1106; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
1107; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
1108; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
1109; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
1110; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
1111; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1
1112; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2
1113; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1114; CHECK-NEXT:    [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float
1115; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]])
1116; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000
1117; CHECK-NEXT:    [[TMP70:%.*]] = fptoui float [[TMP69]] to i32
1118; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 0, [[TMP66]]
1119; CHECK-NEXT:    [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]]
1120; CHECK-NEXT:    [[TMP73:%.*]] = zext i32 [[TMP70]] to i64
1121; CHECK-NEXT:    [[TMP74:%.*]] = zext i32 [[TMP72]] to i64
1122; CHECK-NEXT:    [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]]
1123; CHECK-NEXT:    [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32
1124; CHECK-NEXT:    [[TMP77:%.*]] = lshr i64 [[TMP75]], 32
1125; CHECK-NEXT:    [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32
1126; CHECK-NEXT:    [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]]
1127; CHECK-NEXT:    [[TMP80:%.*]] = zext i32 [[TMP65]] to i64
1128; CHECK-NEXT:    [[TMP81:%.*]] = zext i32 [[TMP79]] to i64
1129; CHECK-NEXT:    [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]]
1130; CHECK-NEXT:    [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32
1131; CHECK-NEXT:    [[TMP84:%.*]] = lshr i64 [[TMP82]], 32
1132; CHECK-NEXT:    [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32
1133; CHECK-NEXT:    [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]]
1134; CHECK-NEXT:    [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]]
1135; CHECK-NEXT:    [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]]
1136; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP85]], 1
1137; CHECK-NEXT:    [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]]
1138; CHECK-NEXT:    [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]]
1139; CHECK-NEXT:    [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]]
1140; CHECK-NEXT:    [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]]
1141; CHECK-NEXT:    [[TMP94:%.*]] = add i32 [[TMP90]], 1
1142; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]]
1143; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2
1144; CHECK-NEXT:    [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3
1145; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1146; CHECK-NEXT:    [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float
1147; CHECK-NEXT:    [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]])
1148; CHECK-NEXT:    [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000
1149; CHECK-NEXT:    [[TMP102:%.*]] = fptoui float [[TMP101]] to i32
1150; CHECK-NEXT:    [[TMP103:%.*]] = sub i32 0, [[TMP98]]
1151; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]]
1152; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP102]] to i64
1153; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1154; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1155; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1156; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1157; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1158; CHECK-NEXT:    [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]]
1159; CHECK-NEXT:    [[TMP112:%.*]] = zext i32 [[TMP97]] to i64
1160; CHECK-NEXT:    [[TMP113:%.*]] = zext i32 [[TMP111]] to i64
1161; CHECK-NEXT:    [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]]
1162; CHECK-NEXT:    [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32
1163; CHECK-NEXT:    [[TMP116:%.*]] = lshr i64 [[TMP114]], 32
1164; CHECK-NEXT:    [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32
1165; CHECK-NEXT:    [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]]
1166; CHECK-NEXT:    [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]]
1167; CHECK-NEXT:    [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]]
1168; CHECK-NEXT:    [[TMP121:%.*]] = add i32 [[TMP117]], 1
1169; CHECK-NEXT:    [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]]
1170; CHECK-NEXT:    [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]]
1171; CHECK-NEXT:    [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]]
1172; CHECK-NEXT:    [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]]
1173; CHECK-NEXT:    [[TMP126:%.*]] = add i32 [[TMP122]], 1
1174; CHECK-NEXT:    [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]]
1175; CHECK-NEXT:    [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3
1176; CHECK-NEXT:    store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1177; CHECK-NEXT:    ret void
1178;
1179; GFX6-LABEL: udiv_v4i32:
1180; GFX6:       ; %bb.0:
1181; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1182; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
1183; GFX6-NEXT:    s_mov_b32 s15, 0xf000
1184; GFX6-NEXT:    s_mov_b32 s14, -1
1185; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1186; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
1187; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
1188; GFX6-NEXT:    s_sub_i32 s2, 0, s8
1189; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s10
1190; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1191; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1192; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s11
1193; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1194; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1195; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1196; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1197; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
1198; GFX6-NEXT:    s_sub_i32 s2, 0, s9
1199; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
1200; GFX6-NEXT:    s_sub_i32 s2, 0, s10
1201; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
1202; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
1203; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1204; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
1205; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
1206; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
1207; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
1208; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
1209; GFX6-NEXT:    v_mul_lo_u32 v5, v1, s9
1210; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
1211; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
1212; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
1213; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
1214; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
1215; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
1216; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
1217; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v4
1218; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1219; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s5, v5
1220; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
1221; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
1222; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1223; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
1224; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1225; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v2
1226; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
1227; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1228; GFX6-NEXT:    v_mul_hi_u32 v4, v2, v4
1229; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v1
1230; GFX6-NEXT:    s_sub_i32 s0, 0, s11
1231; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
1232; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v6
1233; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
1234; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1235; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
1236; GFX6-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
1237; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
1238; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s10
1239; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
1240; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
1241; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
1242; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v3
1243; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
1244; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
1245; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s10, v3
1246; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1247; GFX6-NEXT:    v_mul_hi_u32 v4, s7, v4
1248; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
1249; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
1250; GFX6-NEXT:    v_mul_lo_u32 v6, v4, s11
1251; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
1252; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1253; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1254; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v6
1255; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v3
1256; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
1257; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s11, v3
1258; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1259; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1260; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1261; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
1262; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
1263; GFX6-NEXT:    s_endpgm
1264;
1265; GFX9-LABEL: udiv_v4i32:
1266; GFX9:       ; %bb.0:
1267; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
1268; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1269; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1270; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1271; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
1272; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
1273; GFX9-NEXT:    s_sub_i32 s2, 0, s8
1274; GFX9-NEXT:    s_sub_i32 s3, 0, s9
1275; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1276; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1277; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s10
1278; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s11
1279; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1280; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1281; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1282; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1283; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
1284; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
1285; GFX9-NEXT:    s_sub_i32 s2, 0, s10
1286; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
1287; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v6
1288; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
1289; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
1290; GFX9-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
1291; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
1292; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
1293; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
1294; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v5
1295; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
1296; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s8
1297; GFX9-NEXT:    v_add_u32_e32 v7, 1, v0
1298; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
1299; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
1300; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
1301; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
1302; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
1303; GFX9-NEXT:    v_subrev_u32_e32 v7, s8, v3
1304; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
1305; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
1306; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v2
1307; GFX9-NEXT:    s_sub_i32 s2, 0, s11
1308; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s9
1309; GFX9-NEXT:    v_add_u32_e32 v7, 1, v0
1310; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
1311; GFX9-NEXT:    v_add_u32_e32 v8, 1, v1
1312; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v5
1313; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
1314; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
1315; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v6
1316; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
1317; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
1318; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
1319; GFX9-NEXT:    v_mul_hi_u32 v3, v6, v3
1320; GFX9-NEXT:    v_mul_lo_u32 v8, v2, s10
1321; GFX9-NEXT:    v_subrev_u32_e32 v7, s9, v5
1322; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
1323; GFX9-NEXT:    v_add_u32_e32 v3, v6, v3
1324; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
1325; GFX9-NEXT:    v_add_u32_e32 v7, 1, v1
1326; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
1327; GFX9-NEXT:    v_sub_u32_e32 v5, s6, v8
1328; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
1329; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v5
1330; GFX9-NEXT:    v_subrev_u32_e32 v6, s10, v5
1331; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
1332; GFX9-NEXT:    v_mul_lo_u32 v6, v3, s11
1333; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
1334; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
1335; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
1336; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v5
1337; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v6
1338; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
1339; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
1340; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
1341; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1342; GFX9-NEXT:    v_subrev_u32_e32 v6, s11, v5
1343; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
1344; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
1345; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
1346; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1347; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1348; GFX9-NEXT:    s_endpgm
1349  %r = udiv <4 x i32> %x, %y
1350  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1351  ret void
1352}
1353
; Test case: unsigned remainder of two <4 x i32> kernel arguments, with the
; result stored through %out.
;
; The IR-level expectations below describe a fully scalarized expansion. For
; each of the four lanes they expect: extractelement of both operands, a
; reciprocal estimate of the divisor (uitofp -> llvm.amdgcn.rcp.f32 -> fmul ->
; fptoui), one refinement step using the high half of a 64-bit multiply, a
; quotient estimate via another multiply-high, the remainder candidate
; (x - q * y), two compare-and-subtract corrections, and an insertelement
; of the lane result.
;
; The ISA expectations pin the final code for the two llc RUN lines. The
; tahiti output keeps the work in vector instructions (v_mul_lo_u32,
; v_mul_hi_u32, v_cndmask_b32), while the gfx900 output moves the estimate to
; scalar registers with v_readfirstlane_b32 and finishes with s_mul_i32,
; s_mul_hi_u32 and s_cselect_b32.
;
; All assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script rather than editing by hand.
1354define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1355; CHECK-LABEL: @urem_v4i32(
1356; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1357; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1358; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
1359; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
1360; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
1361; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
1362; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
1363; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
1364; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
1365; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
1366; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
1367; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
1368; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
1369; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
1370; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
1371; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
1372; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1373; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1374; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1375; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1376; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1377; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
1378; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
1379; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
1380; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
1381; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
1382; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
1383; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
1384; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
1385; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0
1386; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1
1387; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1388; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
1389; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
1390; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
1391; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
1392; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
1393; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
1394; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
1395; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
1396; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
1397; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
1398; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
1399; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
1400; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
1401; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
1402; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
1403; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
1404; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
1405; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
1406; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1407; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
1408; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
1409; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
1410; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
1411; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
1412; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
1413; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
1414; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
1415; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1
1416; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2
1417; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1418; CHECK-NEXT:    [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float
1419; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]])
1420; CHECK-NEXT:    [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000
1421; CHECK-NEXT:    [[TMP66:%.*]] = fptoui float [[TMP65]] to i32
1422; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 0, [[TMP62]]
1423; CHECK-NEXT:    [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]]
1424; CHECK-NEXT:    [[TMP69:%.*]] = zext i32 [[TMP66]] to i64
1425; CHECK-NEXT:    [[TMP70:%.*]] = zext i32 [[TMP68]] to i64
1426; CHECK-NEXT:    [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]]
1427; CHECK-NEXT:    [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32
1428; CHECK-NEXT:    [[TMP73:%.*]] = lshr i64 [[TMP71]], 32
1429; CHECK-NEXT:    [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32
1430; CHECK-NEXT:    [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]]
1431; CHECK-NEXT:    [[TMP76:%.*]] = zext i32 [[TMP61]] to i64
1432; CHECK-NEXT:    [[TMP77:%.*]] = zext i32 [[TMP75]] to i64
1433; CHECK-NEXT:    [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]]
1434; CHECK-NEXT:    [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32
1435; CHECK-NEXT:    [[TMP80:%.*]] = lshr i64 [[TMP78]], 32
1436; CHECK-NEXT:    [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32
1437; CHECK-NEXT:    [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]]
1438; CHECK-NEXT:    [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]]
1439; CHECK-NEXT:    [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]]
1440; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]]
1441; CHECK-NEXT:    [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]]
1442; CHECK-NEXT:    [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]]
1443; CHECK-NEXT:    [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]]
1444; CHECK-NEXT:    [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]]
1445; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2
1446; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3
1447; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1448; CHECK-NEXT:    [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float
1449; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]])
1450; CHECK-NEXT:    [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000
1451; CHECK-NEXT:    [[TMP96:%.*]] = fptoui float [[TMP95]] to i32
1452; CHECK-NEXT:    [[TMP97:%.*]] = sub i32 0, [[TMP92]]
1453; CHECK-NEXT:    [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]]
1454; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP96]] to i64
1455; CHECK-NEXT:    [[TMP100:%.*]] = zext i32 [[TMP98]] to i64
1456; CHECK-NEXT:    [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]]
1457; CHECK-NEXT:    [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32
1458; CHECK-NEXT:    [[TMP103:%.*]] = lshr i64 [[TMP101]], 32
1459; CHECK-NEXT:    [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32
1460; CHECK-NEXT:    [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]]
1461; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP91]] to i64
1462; CHECK-NEXT:    [[TMP107:%.*]] = zext i32 [[TMP105]] to i64
1463; CHECK-NEXT:    [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]]
1464; CHECK-NEXT:    [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32
1465; CHECK-NEXT:    [[TMP110:%.*]] = lshr i64 [[TMP108]], 32
1466; CHECK-NEXT:    [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32
1467; CHECK-NEXT:    [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]]
1468; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]]
1469; CHECK-NEXT:    [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]]
1470; CHECK-NEXT:    [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]]
1471; CHECK-NEXT:    [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]]
1472; CHECK-NEXT:    [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]]
1473; CHECK-NEXT:    [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]]
1474; CHECK-NEXT:    [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]]
1475; CHECK-NEXT:    [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3
1476; CHECK-NEXT:    store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1477; CHECK-NEXT:    ret void
1478;
1479; GFX6-LABEL: urem_v4i32:
1480; GFX6:       ; %bb.0:
1481; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1482; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1483; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1484; GFX6-NEXT:    s_mov_b32 s2, -1
1485; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1486; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
1487; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
1488; GFX6-NEXT:    s_sub_i32 s12, 0, s8
1489; GFX6-NEXT:    s_sub_i32 s13, 0, s9
1490; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1491; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1492; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
1493; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s11
1494; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1495; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1496; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1497; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1498; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1499; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v0
1500; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v1
1501; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
1502; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
1503; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1504; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
1505; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
1506; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
1507; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
1508; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v3
1509; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
1510; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
1511; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1512; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
1513; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1514; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1515; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
1516; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1517; GFX6-NEXT:    s_sub_i32 s4, 0, s10
1518; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1519; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v2
1520; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
1521; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
1522; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1523; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
1524; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1525; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
1526; GFX6-NEXT:    s_sub_i32 s4, 0, s11
1527; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1528; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v4
1529; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
1530; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
1531; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
1532; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v3
1533; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1534; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1535; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
1536; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
1537; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
1538; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
1539; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
1540; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
1541; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1542; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s11
1543; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1544; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
1545; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1546; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
1547; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
1548; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1549; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1550; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1551; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1552; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1553; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1554; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1555; GFX6-NEXT:    s_endpgm
1556;
1557; GFX9-LABEL: urem_v4i32:
1558; GFX9:       ; %bb.0:
1559; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
1560; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1561; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1562; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1563; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
1564; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
1565; GFX9-NEXT:    s_sub_i32 s2, 0, s8
1566; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s10
1567; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1568; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1569; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1570; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1571; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1572; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1573; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1574; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
1575; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
1576; GFX9-NEXT:    s_mul_i32 s2, s2, s3
1577; GFX9-NEXT:    s_mul_hi_u32 s2, s3, s2
1578; GFX9-NEXT:    s_add_i32 s3, s3, s2
1579; GFX9-NEXT:    s_mul_hi_u32 s2, s4, s3
1580; GFX9-NEXT:    s_mul_i32 s2, s2, s8
1581; GFX9-NEXT:    s_sub_i32 s2, s4, s2
1582; GFX9-NEXT:    s_sub_i32 s3, s2, s8
1583; GFX9-NEXT:    s_cmp_ge_u32 s2, s8
1584; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
1585; GFX9-NEXT:    s_sub_i32 s3, s2, s8
1586; GFX9-NEXT:    s_cmp_ge_u32 s2, s8
1587; GFX9-NEXT:    v_readfirstlane_b32 s12, v1
1588; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
1589; GFX9-NEXT:    s_sub_i32 s3, 0, s9
1590; GFX9-NEXT:    s_mul_i32 s3, s3, s12
1591; GFX9-NEXT:    s_mul_hi_u32 s3, s12, s3
1592; GFX9-NEXT:    s_add_i32 s12, s12, s3
1593; GFX9-NEXT:    s_mul_hi_u32 s3, s5, s12
1594; GFX9-NEXT:    s_mul_i32 s3, s3, s9
1595; GFX9-NEXT:    s_sub_i32 s3, s5, s3
1596; GFX9-NEXT:    s_sub_i32 s4, s3, s9
1597; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
1598; GFX9-NEXT:    s_cmp_ge_u32 s3, s9
1599; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
1600; GFX9-NEXT:    s_sub_i32 s4, s3, s9
1601; GFX9-NEXT:    s_cmp_ge_u32 s3, s9
1602; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s11
1603; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
1604; GFX9-NEXT:    s_sub_i32 s4, 0, s10
1605; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
1606; GFX9-NEXT:    s_mul_i32 s4, s4, s5
1607; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
1608; GFX9-NEXT:    s_add_i32 s5, s5, s4
1609; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1610; GFX9-NEXT:    s_mul_hi_u32 s4, s6, s5
1611; GFX9-NEXT:    s_mul_i32 s4, s4, s10
1612; GFX9-NEXT:    s_sub_i32 s4, s6, s4
1613; GFX9-NEXT:    s_sub_i32 s5, s4, s10
1614; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1615; GFX9-NEXT:    s_cmp_ge_u32 s4, s10
1616; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1617; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
1618; GFX9-NEXT:    s_sub_i32 s5, s4, s10
1619; GFX9-NEXT:    s_cmp_ge_u32 s4, s10
1620; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
1621; GFX9-NEXT:    s_sub_i32 s5, 0, s11
1622; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
1623; GFX9-NEXT:    s_mul_i32 s5, s5, s6
1624; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
1625; GFX9-NEXT:    s_add_i32 s6, s6, s5
1626; GFX9-NEXT:    s_mul_hi_u32 s5, s7, s6
1627; GFX9-NEXT:    s_mul_i32 s5, s5, s11
1628; GFX9-NEXT:    s_sub_i32 s5, s7, s5
1629; GFX9-NEXT:    s_sub_i32 s6, s5, s11
1630; GFX9-NEXT:    s_cmp_ge_u32 s5, s11
1631; GFX9-NEXT:    s_cselect_b32 s5, s6, s5
1632; GFX9-NEXT:    s_sub_i32 s6, s5, s11
1633; GFX9-NEXT:    s_cmp_ge_u32 s5, s11
1634; GFX9-NEXT:    s_cselect_b32 s5, s6, s5
1635; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1636; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1637; GFX9-NEXT:    v_mov_b32_e32 v2, s4
1638; GFX9-NEXT:    v_mov_b32_e32 v3, s5
1639; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1640; GFX9-NEXT:    s_endpgm
; Input IR under test: a single vector urem whose result is stored to %out.
1641  %r = urem <4 x i32> %x, %y
1642  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1643  ret void
1644}
1645
1646define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1647; CHECK-LABEL: @sdiv_v4i32(
1648; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1649; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1650; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
1651; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
1652; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
1653; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
1654; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
1655; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
1656; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
1657; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
1658; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
1659; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
1660; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
1661; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
1662; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
1663; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
1664; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1665; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1666; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1667; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1668; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1669; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
1670; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
1671; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
1672; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
1673; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
1674; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
1675; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
1676; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
1677; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
1678; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
1679; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
1680; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
1681; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
1682; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
1683; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
1684; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
1685; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
1686; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
1687; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
1688; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0
1689; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1
1690; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1691; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
1692; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
1693; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
1694; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
1695; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
1696; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
1697; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
1698; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
1699; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
1700; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
1701; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
1702; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
1703; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
1704; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
1705; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
1706; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
1707; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
1708; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
1709; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
1710; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
1711; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
1712; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
1713; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
1714; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
1715; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
1716; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
1717; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
1718; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
1719; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
1720; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
1721; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
1722; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
1723; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
1724; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
1725; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
1726; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
1727; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
1728; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
1729; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1
1730; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2
1731; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1732; CHECK-NEXT:    [[TMP85:%.*]] = ashr i32 [[TMP83]], 31
1733; CHECK-NEXT:    [[TMP86:%.*]] = ashr i32 [[TMP84]], 31
1734; CHECK-NEXT:    [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]]
1735; CHECK-NEXT:    [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]]
1736; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]]
1737; CHECK-NEXT:    [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]]
1738; CHECK-NEXT:    [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]]
1739; CHECK-NEXT:    [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float
1740; CHECK-NEXT:    [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]])
1741; CHECK-NEXT:    [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000
1742; CHECK-NEXT:    [[TMP95:%.*]] = fptoui float [[TMP94]] to i32
1743; CHECK-NEXT:    [[TMP96:%.*]] = sub i32 0, [[TMP91]]
1744; CHECK-NEXT:    [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]]
1745; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP95]] to i64
1746; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
1747; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
1748; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
1749; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
1750; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
1751; CHECK-NEXT:    [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]]
1752; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP90]] to i64
1753; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1754; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1755; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1756; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1757; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1758; CHECK-NEXT:    [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]]
1759; CHECK-NEXT:    [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]]
1760; CHECK-NEXT:    [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]]
1761; CHECK-NEXT:    [[TMP114:%.*]] = add i32 [[TMP110]], 1
1762; CHECK-NEXT:    [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]]
1763; CHECK-NEXT:    [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]]
1764; CHECK-NEXT:    [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]]
1765; CHECK-NEXT:    [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]]
1766; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], 1
1767; CHECK-NEXT:    [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]]
1768; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]]
1769; CHECK-NEXT:    [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]]
1770; CHECK-NEXT:    [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2
1771; CHECK-NEXT:    [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3
1772; CHECK-NEXT:    [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1773; CHECK-NEXT:    [[TMP126:%.*]] = ashr i32 [[TMP124]], 31
1774; CHECK-NEXT:    [[TMP127:%.*]] = ashr i32 [[TMP125]], 31
1775; CHECK-NEXT:    [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]]
1776; CHECK-NEXT:    [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]]
1777; CHECK-NEXT:    [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]]
1778; CHECK-NEXT:    [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]]
1779; CHECK-NEXT:    [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]]
1780; CHECK-NEXT:    [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float
1781; CHECK-NEXT:    [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]])
1782; CHECK-NEXT:    [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000
1783; CHECK-NEXT:    [[TMP136:%.*]] = fptoui float [[TMP135]] to i32
1784; CHECK-NEXT:    [[TMP137:%.*]] = sub i32 0, [[TMP132]]
1785; CHECK-NEXT:    [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]]
1786; CHECK-NEXT:    [[TMP139:%.*]] = zext i32 [[TMP136]] to i64
1787; CHECK-NEXT:    [[TMP140:%.*]] = zext i32 [[TMP138]] to i64
1788; CHECK-NEXT:    [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]]
1789; CHECK-NEXT:    [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32
1790; CHECK-NEXT:    [[TMP143:%.*]] = lshr i64 [[TMP141]], 32
1791; CHECK-NEXT:    [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32
1792; CHECK-NEXT:    [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]]
1793; CHECK-NEXT:    [[TMP146:%.*]] = zext i32 [[TMP131]] to i64
1794; CHECK-NEXT:    [[TMP147:%.*]] = zext i32 [[TMP145]] to i64
1795; CHECK-NEXT:    [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]]
1796; CHECK-NEXT:    [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32
1797; CHECK-NEXT:    [[TMP150:%.*]] = lshr i64 [[TMP148]], 32
1798; CHECK-NEXT:    [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32
1799; CHECK-NEXT:    [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]]
1800; CHECK-NEXT:    [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]]
1801; CHECK-NEXT:    [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]]
1802; CHECK-NEXT:    [[TMP155:%.*]] = add i32 [[TMP151]], 1
1803; CHECK-NEXT:    [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]]
1804; CHECK-NEXT:    [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]]
1805; CHECK-NEXT:    [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]]
1806; CHECK-NEXT:    [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]]
1807; CHECK-NEXT:    [[TMP160:%.*]] = add i32 [[TMP156]], 1
1808; CHECK-NEXT:    [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]]
1809; CHECK-NEXT:    [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]]
1810; CHECK-NEXT:    [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]]
1811; CHECK-NEXT:    [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3
1812; CHECK-NEXT:    store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1813; CHECK-NEXT:    ret void
1814;
1815; GFX6-LABEL: sdiv_v4i32:
1816; GFX6:       ; %bb.0:
1817; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1818; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
1819; GFX6-NEXT:    s_mov_b32 s15, 0xf000
1820; GFX6-NEXT:    s_mov_b32 s14, -1
1821; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1822; GFX6-NEXT:    s_ashr_i32 s2, s8, 31
1823; GFX6-NEXT:    s_add_i32 s3, s8, s2
1824; GFX6-NEXT:    s_xor_b32 s3, s3, s2
1825; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
1826; GFX6-NEXT:    s_ashr_i32 s8, s9, 31
1827; GFX6-NEXT:    s_add_i32 s0, s9, s8
1828; GFX6-NEXT:    s_xor_b32 s9, s0, s8
1829; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1830; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
1831; GFX6-NEXT:    s_sub_i32 s1, 0, s3
1832; GFX6-NEXT:    s_ashr_i32 s0, s4, 31
1833; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1834; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1835; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1836; GFX6-NEXT:    s_xor_b32 s2, s0, s2
1837; GFX6-NEXT:    v_mul_lo_u32 v2, s1, v0
1838; GFX6-NEXT:    s_add_i32 s1, s4, s0
1839; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1840; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1841; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
1842; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1843; GFX6-NEXT:    s_sub_i32 s0, 0, s9
1844; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1845; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
1846; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
1847; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
1848; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
1849; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
1850; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1851; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v3
1852; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1853; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s3, v3
1854; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
1855; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
1856; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1857; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
1858; GFX6-NEXT:    s_ashr_i32 s0, s5, 31
1859; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1860; GFX6-NEXT:    s_add_i32 s1, s5, s0
1861; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
1862; GFX6-NEXT:    s_ashr_i32 s3, s10, 31
1863; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1864; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
1865; GFX6-NEXT:    s_xor_b32 s2, s0, s8
1866; GFX6-NEXT:    s_add_i32 s0, s10, s3
1867; GFX6-NEXT:    s_xor_b32 s4, s0, s3
1868; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
1869; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
1870; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1871; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s9
1872; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1873; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
1874; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
1875; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
1876; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
1877; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1878; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v2
1879; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
1880; GFX6-NEXT:    s_sub_i32 s0, 0, s4
1881; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
1882; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1883; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
1884; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1885; GFX6-NEXT:    v_mul_hi_u32 v2, v3, v5
1886; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
1887; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
1888; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
1889; GFX6-NEXT:    s_ashr_i32 s0, s6, 31
1890; GFX6-NEXT:    s_add_i32 s5, s11, s2
1891; GFX6-NEXT:    s_add_i32 s1, s6, s0
1892; GFX6-NEXT:    s_xor_b32 s5, s5, s2
1893; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1894; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1895; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s5
1896; GFX6-NEXT:    v_mul_hi_u32 v2, s1, v2
1897; GFX6-NEXT:    s_xor_b32 s3, s0, s3
1898; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v4
1899; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s4
1900; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
1901; GFX6-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
1902; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1903; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
1904; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s4, v3
1905; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
1906; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v3
1907; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1908; GFX6-NEXT:    s_sub_i32 s0, 0, s5
1909; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
1910; GFX6-NEXT:    s_ashr_i32 s0, s7, 31
1911; GFX6-NEXT:    s_add_i32 s1, s7, s0
1912; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1913; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
1914; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
1915; GFX6-NEXT:    s_xor_b32 s2, s0, s2
1916; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
1917; GFX6-NEXT:    v_mul_hi_u32 v4, s1, v4
1918; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
1919; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1920; GFX6-NEXT:    v_xor_b32_e32 v2, s3, v2
1921; GFX6-NEXT:    v_mul_lo_u32 v3, v4, s5
1922; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1923; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v2
1924; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1925; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s5, v3
1926; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
1927; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v3
1928; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1929; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1930; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
1931; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
1932; GFX6-NEXT:    v_xor_b32_e32 v3, s2, v3
1933; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v3
1934; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
1935; GFX6-NEXT:    s_endpgm
1936;
1937; GFX9-LABEL: sdiv_v4i32:
1938; GFX9:       ; %bb.0:
1939; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
1940; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1941; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1942; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1943; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
1944; GFX9-NEXT:    s_add_i32 s3, s8, s2
1945; GFX9-NEXT:    s_xor_b32 s3, s3, s2
1946; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
1947; GFX9-NEXT:    s_ashr_i32 s12, s9, 31
1948; GFX9-NEXT:    s_add_i32 s9, s9, s12
1949; GFX9-NEXT:    s_xor_b32 s9, s9, s12
1950; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1951; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
1952; GFX9-NEXT:    s_sub_i32 s14, 0, s3
1953; GFX9-NEXT:    s_ashr_i32 s8, s4, 31
1954; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1955; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1956; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1957; GFX9-NEXT:    s_add_i32 s4, s4, s8
1958; GFX9-NEXT:    s_xor_b32 s4, s4, s8
1959; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v0
1960; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1961; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1962; GFX9-NEXT:    s_sub_i32 s14, 0, s9
1963; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
1964; GFX9-NEXT:    s_ashr_i32 s13, s5, 31
1965; GFX9-NEXT:    v_mul_lo_u32 v3, s14, v1
1966; GFX9-NEXT:    s_add_i32 s5, s5, s13
1967; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
1968; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
1969; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
1970; GFX9-NEXT:    s_xor_b32 s5, s5, s13
1971; GFX9-NEXT:    s_xor_b32 s2, s8, s2
1972; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s3
1973; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
1974; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
1975; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
1976; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
1977; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
1978; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
1979; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v3
1980; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
1981; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
1982; GFX9-NEXT:    s_ashr_i32 s3, s10, 31
1983; GFX9-NEXT:    s_add_i32 s4, s10, s3
1984; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
1985; GFX9-NEXT:    s_xor_b32 s4, s4, s3
1986; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1987; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
1988; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s9
1989; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
1990; GFX9-NEXT:    s_ashr_i32 s8, s11, 31
1991; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1992; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
1993; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
1994; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1995; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
1996; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
1997; GFX9-NEXT:    v_subrev_u32_e32 v5, s9, v2
1998; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1999; GFX9-NEXT:    s_sub_i32 s5, 0, s4
2000; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
2001; GFX9-NEXT:    v_mul_lo_u32 v2, s5, v3
2002; GFX9-NEXT:    s_add_i32 s9, s11, s8
2003; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
2004; GFX9-NEXT:    s_xor_b32 s9, s9, s8
2005; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
2006; GFX9-NEXT:    v_mul_hi_u32 v2, v3, v2
2007; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s9
2008; GFX9-NEXT:    s_ashr_i32 s5, s6, 31
2009; GFX9-NEXT:    s_add_i32 s6, s6, s5
2010; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
2011; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v5
2012; GFX9-NEXT:    s_xor_b32 s6, s6, s5
2013; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
2014; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
2015; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
2016; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2017; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
2018; GFX9-NEXT:    s_xor_b32 s2, s13, s12
2019; GFX9-NEXT:    v_mul_lo_u32 v5, v2, s4
2020; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
2021; GFX9-NEXT:    v_subrev_u32_e32 v1, s2, v1
2022; GFX9-NEXT:    s_xor_b32 s2, s5, s3
2023; GFX9-NEXT:    s_sub_i32 s3, 0, s9
2024; GFX9-NEXT:    v_mul_lo_u32 v7, s3, v3
2025; GFX9-NEXT:    v_sub_u32_e32 v5, s6, v5
2026; GFX9-NEXT:    v_add_u32_e32 v6, 1, v2
2027; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
2028; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2029; GFX9-NEXT:    v_subrev_u32_e32 v6, s4, v5
2030; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
2031; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v7
2032; GFX9-NEXT:    s_ashr_i32 s3, s7, 31
2033; GFX9-NEXT:    s_add_i32 s5, s7, s3
2034; GFX9-NEXT:    s_xor_b32 s5, s5, s3
2035; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
2036; GFX9-NEXT:    v_mul_hi_u32 v3, s5, v3
2037; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
2038; GFX9-NEXT:    v_add_u32_e32 v6, 1, v2
2039; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2040; GFX9-NEXT:    v_mul_lo_u32 v5, v3, s9
2041; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
2042; GFX9-NEXT:    v_xor_b32_e32 v2, s2, v2
2043; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v2
2044; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v5
2045; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
2046; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2047; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v5
2048; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
2049; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
2050; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
2051; GFX9-NEXT:    s_xor_b32 s2, s3, s8
2052; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2053; GFX9-NEXT:    v_xor_b32_e32 v3, s2, v3
2054; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v3
2055; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2056; GFX9-NEXT:    s_endpgm
2057  %r = sdiv <4 x i32> %x, %y
2058  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
2059  ret void
2060}
2061
; srem_v4i32: signed remainder of two <4 x i32> kernel arguments, stored to %out.
;
; The assertions below are autogenerated (see the note at the top of this file);
; regenerate them with the update script rather than editing them by hand.
;
; Per lane, the opt-level checks expect the srem to be expanded into:
;   - sign masks via ashr 31 and absolute values via add + xor,
;   - a reciprocal estimate of |y| (uitofp, amdgcn.rcp, fmul, fptoui) followed by
;     one integer refinement step built from 32x32->64 multiplies,
;   - a quotient estimate, remainder = |x| - q * |y|, and two conditional
;     subtractions of |y|,
;   - restoring the sign of the dividend via xor + sub with x's sign mask.
; The four lane results are reassembled with insertelement before the store.
;
; The GFX6 and GFX9 prefixes check the final ISA from the two llc invocations.
2062define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
2063; CHECK-LABEL: @srem_v4i32(
2064; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
2065; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
2066; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
2067; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
2068; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
2069; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
2070; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
2071; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
2072; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
2073; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2074; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
2075; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
2076; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
2077; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
2078; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
2079; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
2080; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
2081; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
2082; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
2083; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
2084; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
2085; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
2086; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
2087; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
2088; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
2089; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
2090; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
2091; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
2092; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
2093; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
2094; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
2095; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
2096; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
2097; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
2098; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
2099; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
2100; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
2101; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0
2102; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1
2103; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1
2104; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
2105; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
2106; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
2107; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
2108; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
2109; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
2110; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
2111; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
2112; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
2113; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
2114; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
2115; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
2116; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
2117; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
2118; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
2119; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
2120; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
2121; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
2122; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
2123; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
2124; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
2125; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
2126; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
2127; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
2128; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
2129; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
2130; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
2131; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
2132; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
2133; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
2134; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
2135; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
2136; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
2137; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
2138; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
2139; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1
2140; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2
2141; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2
2142; CHECK-NEXT:    [[TMP79:%.*]] = ashr i32 [[TMP77]], 31
2143; CHECK-NEXT:    [[TMP80:%.*]] = ashr i32 [[TMP78]], 31
2144; CHECK-NEXT:    [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]]
2145; CHECK-NEXT:    [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]]
2146; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]]
2147; CHECK-NEXT:    [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]]
2148; CHECK-NEXT:    [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float
2149; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]])
2150; CHECK-NEXT:    [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000
2151; CHECK-NEXT:    [[TMP88:%.*]] = fptoui float [[TMP87]] to i32
2152; CHECK-NEXT:    [[TMP89:%.*]] = sub i32 0, [[TMP84]]
2153; CHECK-NEXT:    [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]]
2154; CHECK-NEXT:    [[TMP91:%.*]] = zext i32 [[TMP88]] to i64
2155; CHECK-NEXT:    [[TMP92:%.*]] = zext i32 [[TMP90]] to i64
2156; CHECK-NEXT:    [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]]
2157; CHECK-NEXT:    [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32
2158; CHECK-NEXT:    [[TMP95:%.*]] = lshr i64 [[TMP93]], 32
2159; CHECK-NEXT:    [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32
2160; CHECK-NEXT:    [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]]
2161; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP83]] to i64
2162; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
2163; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
2164; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
2165; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
2166; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
2167; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]]
2168; CHECK-NEXT:    [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]]
2169; CHECK-NEXT:    [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]]
2170; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]]
2171; CHECK-NEXT:    [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]]
2172; CHECK-NEXT:    [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]]
2173; CHECK-NEXT:    [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]]
2174; CHECK-NEXT:    [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]]
2175; CHECK-NEXT:    [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]]
2176; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]]
2177; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2
2178; CHECK-NEXT:    [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3
2179; CHECK-NEXT:    [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3
2180; CHECK-NEXT:    [[TMP117:%.*]] = ashr i32 [[TMP115]], 31
2181; CHECK-NEXT:    [[TMP118:%.*]] = ashr i32 [[TMP116]], 31
2182; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]]
2183; CHECK-NEXT:    [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]]
2184; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]]
2185; CHECK-NEXT:    [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]]
2186; CHECK-NEXT:    [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float
2187; CHECK-NEXT:    [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]])
2188; CHECK-NEXT:    [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000
2189; CHECK-NEXT:    [[TMP126:%.*]] = fptoui float [[TMP125]] to i32
2190; CHECK-NEXT:    [[TMP127:%.*]] = sub i32 0, [[TMP122]]
2191; CHECK-NEXT:    [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]]
2192; CHECK-NEXT:    [[TMP129:%.*]] = zext i32 [[TMP126]] to i64
2193; CHECK-NEXT:    [[TMP130:%.*]] = zext i32 [[TMP128]] to i64
2194; CHECK-NEXT:    [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]]
2195; CHECK-NEXT:    [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32
2196; CHECK-NEXT:    [[TMP133:%.*]] = lshr i64 [[TMP131]], 32
2197; CHECK-NEXT:    [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32
2198; CHECK-NEXT:    [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]]
2199; CHECK-NEXT:    [[TMP136:%.*]] = zext i32 [[TMP121]] to i64
2200; CHECK-NEXT:    [[TMP137:%.*]] = zext i32 [[TMP135]] to i64
2201; CHECK-NEXT:    [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]]
2202; CHECK-NEXT:    [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32
2203; CHECK-NEXT:    [[TMP140:%.*]] = lshr i64 [[TMP138]], 32
2204; CHECK-NEXT:    [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32
2205; CHECK-NEXT:    [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]]
2206; CHECK-NEXT:    [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]]
2207; CHECK-NEXT:    [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]]
2208; CHECK-NEXT:    [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]]
2209; CHECK-NEXT:    [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]]
2210; CHECK-NEXT:    [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]]
2211; CHECK-NEXT:    [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]]
2212; CHECK-NEXT:    [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]]
2213; CHECK-NEXT:    [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]]
2214; CHECK-NEXT:    [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]]
2215; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3
2216; CHECK-NEXT:    store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
2217; CHECK-NEXT:    ret void
2218;
2219; GFX6-LABEL: srem_v4i32:
2220; GFX6:       ; %bb.0:
2221; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
2222; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2223; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2224; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2225; GFX6-NEXT:    s_ashr_i32 s2, s8, 31
2226; GFX6-NEXT:    s_add_i32 s8, s8, s2
2227; GFX6-NEXT:    s_xor_b32 s8, s8, s2
2228; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
2229; GFX6-NEXT:    s_ashr_i32 s13, s9, 31
2230; GFX6-NEXT:    s_add_i32 s9, s9, s13
2231; GFX6-NEXT:    s_xor_b32 s9, s9, s13
2232; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2233; GFX6-NEXT:    s_sub_i32 s14, 0, s8
2234; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
2235; GFX6-NEXT:    s_ashr_i32 s12, s4, 31
2236; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2237; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
2238; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2239; GFX6-NEXT:    s_add_i32 s4, s4, s12
2240; GFX6-NEXT:    s_xor_b32 s4, s4, s12
2241; GFX6-NEXT:    v_mul_lo_u32 v2, s14, v0
2242; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
2243; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2244; GFX6-NEXT:    s_sub_i32 s14, 0, s9
2245; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
2246; GFX6-NEXT:    s_ashr_i32 s13, s5, 31
2247; GFX6-NEXT:    s_add_i32 s5, s5, s13
2248; GFX6-NEXT:    s_xor_b32 s5, s5, s13
2249; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
2250; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
2251; GFX6-NEXT:    v_mul_lo_u32 v2, s14, v1
2252; GFX6-NEXT:    s_mov_b32 s2, -1
2253; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
2254; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
2255; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
2256; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
2257; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
2258; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2259; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
2260; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2261; GFX6-NEXT:    s_ashr_i32 s4, s10, 31
2262; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
2263; GFX6-NEXT:    s_add_i32 s8, s10, s4
2264; GFX6-NEXT:    s_xor_b32 s4, s8, s4
2265; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s4
2266; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
2267; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2268; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
2269; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2270; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
2271; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
2272; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
2273; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
2274; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
2275; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
2276; GFX6-NEXT:    s_sub_i32 s5, 0, s4
2277; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
2278; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v2
2279; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2280; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
2281; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
2282; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2283; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v4
2284; GFX6-NEXT:    s_ashr_i32 s8, s11, 31
2285; GFX6-NEXT:    s_add_i32 s9, s11, s8
2286; GFX6-NEXT:    s_ashr_i32 s5, s6, 31
2287; GFX6-NEXT:    s_xor_b32 s8, s9, s8
2288; GFX6-NEXT:    s_add_i32 s6, s6, s5
2289; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
2290; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
2291; GFX6-NEXT:    s_xor_b32 s6, s6, s5
2292; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
2293; GFX6-NEXT:    v_xor_b32_e32 v1, s13, v1
2294; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
2295; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s13, v1
2296; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s4
2297; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
2298; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
2299; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
2300; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v2
2301; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
2302; GFX6-NEXT:    s_sub_i32 s6, 0, s8
2303; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
2304; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v3
2305; GFX6-NEXT:    s_ashr_i32 s6, s7, 31
2306; GFX6-NEXT:    s_add_i32 s7, s7, s6
2307; GFX6-NEXT:    s_xor_b32 s7, s7, s6
2308; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
2309; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v2
2310; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
2311; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
2312; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
2313; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2314; GFX6-NEXT:    v_xor_b32_e32 v2, s5, v2
2315; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s8
2316; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s5, v2
2317; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
2318; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
2319; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
2320; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2321; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
2322; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
2323; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2324; GFX6-NEXT:    v_xor_b32_e32 v3, s6, v3
2325; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v3
2326; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2327; GFX6-NEXT:    s_endpgm
2328;
2329; GFX9-LABEL: srem_v4i32:
2330; GFX9:       ; %bb.0:
2331; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
2332; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2333; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2334; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2335; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
2336; GFX9-NEXT:    s_add_i32 s3, s8, s2
2337; GFX9-NEXT:    s_xor_b32 s2, s3, s2
2338; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
2339; GFX9-NEXT:    s_sub_i32 s8, 0, s2
2340; GFX9-NEXT:    s_ashr_i32 s3, s4, 31
2341; GFX9-NEXT:    s_add_i32 s4, s4, s3
2342; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2343; GFX9-NEXT:    s_xor_b32 s4, s4, s3
2344; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2345; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2346; GFX9-NEXT:    v_readfirstlane_b32 s12, v0
2347; GFX9-NEXT:    s_mul_i32 s8, s8, s12
2348; GFX9-NEXT:    s_mul_hi_u32 s8, s12, s8
2349; GFX9-NEXT:    s_add_i32 s12, s12, s8
2350; GFX9-NEXT:    s_mul_hi_u32 s8, s4, s12
2351; GFX9-NEXT:    s_mul_i32 s8, s8, s2
2352; GFX9-NEXT:    s_sub_i32 s4, s4, s8
2353; GFX9-NEXT:    s_sub_i32 s8, s4, s2
2354; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
2355; GFX9-NEXT:    s_cselect_b32 s4, s8, s4
2356; GFX9-NEXT:    s_sub_i32 s8, s4, s2
2357; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
2358; GFX9-NEXT:    s_cselect_b32 s2, s8, s4
2359; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
2360; GFX9-NEXT:    s_add_i32 s8, s9, s4
2361; GFX9-NEXT:    s_xor_b32 s4, s8, s4
2362; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
2363; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
2364; GFX9-NEXT:    s_xor_b32 s2, s2, s3
2365; GFX9-NEXT:    s_add_i32 s5, s5, s8
2366; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2367; GFX9-NEXT:    s_sub_i32 s2, s2, s3
2368; GFX9-NEXT:    s_xor_b32 s3, s5, s8
2369; GFX9-NEXT:    s_sub_i32 s5, 0, s4
2370; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2371; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2372; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
2373; GFX9-NEXT:    s_mul_i32 s5, s5, s9
2374; GFX9-NEXT:    s_mul_hi_u32 s5, s9, s5
2375; GFX9-NEXT:    s_add_i32 s9, s9, s5
2376; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s9
2377; GFX9-NEXT:    s_mul_i32 s5, s5, s4
2378; GFX9-NEXT:    s_sub_i32 s3, s3, s5
2379; GFX9-NEXT:    s_sub_i32 s5, s3, s4
2380; GFX9-NEXT:    s_cmp_ge_u32 s3, s4
2381; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
2382; GFX9-NEXT:    s_sub_i32 s5, s3, s4
2383; GFX9-NEXT:    s_cmp_ge_u32 s3, s4
2384; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
2385; GFX9-NEXT:    s_ashr_i32 s4, s10, 31
2386; GFX9-NEXT:    s_add_i32 s5, s10, s4
2387; GFX9-NEXT:    s_xor_b32 s4, s5, s4
2388; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
2389; GFX9-NEXT:    s_xor_b32 s3, s3, s8
2390; GFX9-NEXT:    s_sub_i32 s3, s3, s8
2391; GFX9-NEXT:    s_sub_i32 s8, 0, s4
2392; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2393; GFX9-NEXT:    s_ashr_i32 s5, s6, 31
2394; GFX9-NEXT:    s_add_i32 s6, s6, s5
2395; GFX9-NEXT:    s_xor_b32 s6, s6, s5
2396; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2397; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2398; GFX9-NEXT:    v_mov_b32_e32 v1, s3
2399; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
2400; GFX9-NEXT:    s_mul_i32 s8, s8, s9
2401; GFX9-NEXT:    s_mul_hi_u32 s8, s9, s8
2402; GFX9-NEXT:    s_add_i32 s9, s9, s8
2403; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s9
2404; GFX9-NEXT:    s_mul_i32 s8, s8, s4
2405; GFX9-NEXT:    s_sub_i32 s6, s6, s8
2406; GFX9-NEXT:    s_sub_i32 s8, s6, s4
2407; GFX9-NEXT:    s_cmp_ge_u32 s6, s4
2408; GFX9-NEXT:    s_cselect_b32 s6, s8, s6
2409; GFX9-NEXT:    s_sub_i32 s8, s6, s4
2410; GFX9-NEXT:    s_cmp_ge_u32 s6, s4
2411; GFX9-NEXT:    s_cselect_b32 s4, s8, s6
2412; GFX9-NEXT:    s_ashr_i32 s6, s11, 31
2413; GFX9-NEXT:    s_add_i32 s8, s11, s6
2414; GFX9-NEXT:    s_xor_b32 s6, s8, s6
2415; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
2416; GFX9-NEXT:    v_mov_b32_e32 v0, s2
2417; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
2418; GFX9-NEXT:    s_xor_b32 s3, s4, s5
2419; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2420; GFX9-NEXT:    s_add_i32 s4, s7, s2
2421; GFX9-NEXT:    s_sub_i32 s3, s3, s5
2422; GFX9-NEXT:    s_sub_i32 s5, 0, s6
2423; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
2424; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2425; GFX9-NEXT:    s_xor_b32 s4, s4, s2
2426; GFX9-NEXT:    v_readfirstlane_b32 s7, v2
2427; GFX9-NEXT:    s_mul_i32 s5, s5, s7
2428; GFX9-NEXT:    s_mul_hi_u32 s5, s7, s5
2429; GFX9-NEXT:    s_add_i32 s7, s7, s5
2430; GFX9-NEXT:    s_mul_hi_u32 s5, s4, s7
2431; GFX9-NEXT:    s_mul_i32 s5, s5, s6
2432; GFX9-NEXT:    s_sub_i32 s4, s4, s5
2433; GFX9-NEXT:    s_sub_i32 s5, s4, s6
2434; GFX9-NEXT:    s_cmp_ge_u32 s4, s6
2435; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
2436; GFX9-NEXT:    s_sub_i32 s5, s4, s6
2437; GFX9-NEXT:    s_cmp_ge_u32 s4, s6
2438; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
2439; GFX9-NEXT:    s_xor_b32 s4, s4, s2
2440; GFX9-NEXT:    s_sub_i32 s2, s4, s2
2441; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2442; GFX9-NEXT:    v_mov_b32_e32 v3, s2
2443; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2444; GFX9-NEXT:    s_endpgm
2445  %r = srem <4 x i32> %x, %y
2446  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
2447  ret void
2448}
2449
; Unsigned division of <4 x i16> vectors: divides %x by %y element-wise and
; stores the quotient vector to %out.
;
; The assertions below are autogenerated (see the NOTE on the first line of
; this file), so the check lines themselves are left untouched. Going by the
; RUN lines at the top of the file:
;   - lines with the CHECK prefix match the IR printed by opt with
;     -amdgpu-codegenprepare. They show each of the four lanes being handled
;     separately: the i16 operands are zero-extended to i32 and converted to
;     float, a quotient estimate is formed with llvm.amdgcn.rcp.f32, fmul and
;     llvm.trunc.f32, a 0-or-1 correction is selected by comparing the
;     llvm.amdgcn.fmad.ftz.f32 remainder against the divisor, and the sum is
;     masked with 65535 and truncated back to i16.
;   - lines with the GFX6 prefix match the llc output for -mcpu=tahiti.
;   - lines with the GFX9 prefix match the llc output for -mcpu=gfx900.
2450define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2451; CHECK-LABEL: @udiv_v4i16(
2452; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2453; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2454; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2455; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2456; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2457; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2458; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2459; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2460; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2461; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2462; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2463; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2464; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2465; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2466; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2467; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2468; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2469; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
2470; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
2471; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0
2472; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1
2473; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2474; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
2475; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
2476; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
2477; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
2478; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
2479; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
2480; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
2481; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
2482; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
2483; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
2484; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
2485; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
2486; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
2487; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
2488; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
2489; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
2490; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
2491; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1
2492; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2
2493; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2494; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
2495; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
2496; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
2497; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
2498; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
2499; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
2500; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
2501; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
2502; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
2503; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
2504; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
2505; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
2506; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
2507; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
2508; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
2509; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
2510; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
2511; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2
2512; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3
2513; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2514; CHECK-NEXT:    [[TMP63:%.*]] = zext i16 [[TMP61]] to i32
2515; CHECK-NEXT:    [[TMP64:%.*]] = zext i16 [[TMP62]] to i32
2516; CHECK-NEXT:    [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float
2517; CHECK-NEXT:    [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float
2518; CHECK-NEXT:    [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]])
2519; CHECK-NEXT:    [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]]
2520; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]])
2521; CHECK-NEXT:    [[TMP70:%.*]] = fneg fast float [[TMP69]]
2522; CHECK-NEXT:    [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]])
2523; CHECK-NEXT:    [[TMP72:%.*]] = fptoui float [[TMP69]] to i32
2524; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]])
2525; CHECK-NEXT:    [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
2526; CHECK-NEXT:    [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]]
2527; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0
2528; CHECK-NEXT:    [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]]
2529; CHECK-NEXT:    [[TMP78:%.*]] = and i32 [[TMP77]], 65535
2530; CHECK-NEXT:    [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16
2531; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3
2532; CHECK-NEXT:    store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2533; CHECK-NEXT:    ret void
2534;
2535; GFX6-LABEL: udiv_v4i16:
2536; GFX6:       ; %bb.0:
2537; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
2538; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2539; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2540; GFX6-NEXT:    s_mov_b32 s2, -1
2541; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2542; GFX6-NEXT:    s_and_b32 s9, s6, 0xffff
2543; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
2544; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
2545; GFX6-NEXT:    s_and_b32 s8, s4, 0xffff
2546; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
2547; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
2548; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
2549; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
2550; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
2551; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
2552; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
2553; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
2554; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
2555; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2556; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
2557; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2558; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
2559; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
2560; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
2561; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
2562; GFX6-NEXT:    s_and_b32 s4, s5, 0xffff
2563; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
2564; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2565; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
2566; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2567; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
2568; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
2569; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
2570; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2571; GFX6-NEXT:    s_lshr_b32 s4, s7, 16
2572; GFX6-NEXT:    v_mad_f32 v3, -v1, v4, v5
2573; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
2574; GFX6-NEXT:    s_lshr_b32 s4, s5, 16
2575; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
2576; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2577; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
2578; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
2579; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2580; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2581; GFX6-NEXT:    v_mul_f32_e32 v3, v6, v7
2582; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
2583; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
2584; GFX6-NEXT:    v_mad_f32 v3, -v3, v5, v6
2585; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
2586; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2587; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
2588; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2589; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2590; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
2591; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
2592; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2593; GFX6-NEXT:    s_endpgm
2594;
2595; GFX9-LABEL: udiv_v4i16:
2596; GFX9:       ; %bb.0:
2597; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
2598; GFX9-NEXT:    v_mov_b32_e32 v6, 0
2599; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2600; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2601; GFX9-NEXT:    s_and_b32 s3, s6, 0xffff
2602; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
2603; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
2604; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
2605; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
2606; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
2607; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
2608; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
2609; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
2610; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
2611; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
2612; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
2613; GFX9-NEXT:    s_and_b32 s2, s7, 0xffff
2614; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
2615; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
2616; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s2
2617; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
2618; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
2619; GFX9-NEXT:    s_and_b32 s2, s5, 0xffff
2620; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
2621; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
2622; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
2623; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
2624; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
2625; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
2626; GFX9-NEXT:    s_lshr_b32 s2, s7, 16
2627; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v7
2628; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
2629; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
2630; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
2631; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2632; GFX9-NEXT:    s_lshr_b32 s2, s5, 16
2633; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s2
2634; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v5
2635; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
2636; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2637; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
2638; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v8
2639; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2640; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
2641; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2642; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v7
2643; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
2644; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
2645; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2646; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2647; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
2648; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
2649; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
2650; GFX9-NEXT:    s_endpgm
2651  %r = udiv <4 x i16> %x, %y
2652  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2653  ret void
2654}
2655
; Unsigned remainder of <4 x i16> vectors: computes %x urem %y element-wise
; and stores the remainder vector to %out.
;
; The assertions below are autogenerated (see the NOTE on the first line of
; this file), so the check lines themselves are left untouched. Going by the
; RUN lines at the top of the file:
;   - lines with the CHECK prefix match the IR printed by opt with
;     -amdgpu-codegenprepare. They show each of the four lanes being handled
;     separately: the i16 operands are zero-extended to i32 and converted to
;     float, a quotient is formed with llvm.amdgcn.rcp.f32, fmul,
;     llvm.trunc.f32 and a 0-or-1 correction, then the quotient is multiplied
;     by the divisor and subtracted from the dividend, and the difference is
;     masked with 65535 and truncated back to i16.
;   - lines with the GFX6 prefix match the llc output for -mcpu=tahiti.
;   - lines with the GFX9 prefix match the llc output for -mcpu=gfx900.
2656define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2657; CHECK-LABEL: @urem_v4i16(
2658; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2659; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2660; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2661; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2662; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2663; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2664; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2665; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2666; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2667; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2668; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2669; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2670; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2671; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2672; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2673; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2674; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2675; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
2676; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
2677; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
2678; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
2679; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0
2680; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1
2681; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2682; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
2683; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
2684; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
2685; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
2686; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
2687; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
2688; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
2689; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
2690; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
2691; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
2692; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2693; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
2694; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
2695; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
2696; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
2697; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
2698; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
2699; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
2700; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
2701; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1
2702; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2
2703; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2704; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
2705; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
2706; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
2707; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
2708; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
2709; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
2710; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
2711; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
2712; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
2713; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
2714; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
2715; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
2716; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
2717; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
2718; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
2719; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
2720; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
2721; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
2722; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
2723; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2
2724; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3
2725; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2726; CHECK-NEXT:    [[TMP69:%.*]] = zext i16 [[TMP67]] to i32
2727; CHECK-NEXT:    [[TMP70:%.*]] = zext i16 [[TMP68]] to i32
2728; CHECK-NEXT:    [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float
2729; CHECK-NEXT:    [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float
2730; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]])
2731; CHECK-NEXT:    [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]]
2732; CHECK-NEXT:    [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]])
2733; CHECK-NEXT:    [[TMP76:%.*]] = fneg fast float [[TMP75]]
2734; CHECK-NEXT:    [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]])
2735; CHECK-NEXT:    [[TMP78:%.*]] = fptoui float [[TMP75]] to i32
2736; CHECK-NEXT:    [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]])
2737; CHECK-NEXT:    [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]])
2738; CHECK-NEXT:    [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]]
2739; CHECK-NEXT:    [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0
2740; CHECK-NEXT:    [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]]
2741; CHECK-NEXT:    [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]]
2742; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]]
2743; CHECK-NEXT:    [[TMP86:%.*]] = and i32 [[TMP85]], 65535
2744; CHECK-NEXT:    [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16
2745; CHECK-NEXT:    [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3
2746; CHECK-NEXT:    store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2747; CHECK-NEXT:    ret void
2748;
2749; GFX6-LABEL: urem_v4i16:
2750; GFX6:       ; %bb.0:
2751; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
2752; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2753; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2754; GFX6-NEXT:    s_mov_b32 s2, -1
2755; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2756; GFX6-NEXT:    s_and_b32 s8, s6, 0xffff
2757; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
2758; GFX6-NEXT:    v_mov_b32_e32 v4, s6
2759; GFX6-NEXT:    v_alignbit_b32 v4, s7, v4, 16
2760; GFX6-NEXT:    s_and_b32 s8, s4, 0xffff
2761; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v4
2762; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s8
2763; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
2764; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, v5
2765; GFX6-NEXT:    v_mov_b32_e32 v1, s4
2766; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 16
2767; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v1
2768; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
2769; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, v6
2770; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
2771; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
2772; GFX6-NEXT:    v_mad_f32 v2, -v3, v0, v2
2773; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
2774; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
2775; GFX6-NEXT:    v_mul_f32_e32 v2, v6, v7
2776; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
2777; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
2778; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
2779; GFX6-NEXT:    v_mad_f32 v2, -v2, v5, v6
2780; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
2781; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v5
2782; GFX6-NEXT:    s_and_b32 s6, s7, 0xffff
2783; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
2784; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s6
2785; GFX6-NEXT:    s_and_b32 s6, s5, 0xffff
2786; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
2787; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s6
2788; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
2789; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
2790; GFX6-NEXT:    s_lshr_b32 s4, s7, 16
2791; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v2, v1
2792; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
2793; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
2794; GFX6-NEXT:    s_lshr_b32 s6, s5, 16
2795; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s6
2796; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2797; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
2798; GFX6-NEXT:    v_mad_f32 v4, -v1, v3, v4
2799; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2800; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
2801; GFX6-NEXT:    v_mul_f32_e32 v3, v6, v7
2802; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
2803; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
2804; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2805; GFX6-NEXT:    v_mad_f32 v3, -v3, v5, v6
2806; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
2807; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
2808; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
2809; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s4
2810; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2811; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2812; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
2813; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
2814; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2815; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2816; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
2817; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
2818; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2819; GFX6-NEXT:    s_endpgm
2820;
2821; GFX9-LABEL: urem_v4i16:
2822; GFX9:       ; %bb.0:
2823; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
2824; GFX9-NEXT:    v_mov_b32_e32 v6, 0
2825; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2826; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2827; GFX9-NEXT:    s_and_b32 s3, s6, 0xffff
2828; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
2829; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
2830; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
2831; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
2832; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
2833; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
2834; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
2835; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
2836; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
2837; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
2838; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
2839; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
2840; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
2841; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
2842; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
2843; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
2844; GFX9-NEXT:    s_and_b32 s3, s7, 0xffff
2845; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s3
2846; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
2847; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
2848; GFX9-NEXT:    s_and_b32 s8, s5, 0xffff
2849; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
2850; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2851; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s8
2852; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
2853; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
2854; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
2855; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v7
2856; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
2857; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
2858; GFX9-NEXT:    s_lshr_b32 s6, s7, 16
2859; GFX9-NEXT:    v_mad_f32 v3, -v2, v4, v5
2860; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s6
2861; GFX9-NEXT:    s_lshr_b32 s5, s5, 16
2862; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s5
2863; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2864; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v5
2865; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
2866; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
2867; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
2868; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v8
2869; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2870; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
2871; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v7
2872; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
2873; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s3
2874; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
2875; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s6
2876; GFX9-NEXT:    v_sub_u32_e32 v4, s4, v1
2877; GFX9-NEXT:    v_sub_u32_e32 v1, s8, v2
2878; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2879; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v3
2880; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2881; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
2882; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
2883; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
2884; GFX9-NEXT:    s_endpgm
2885  %r = urem <4 x i16> %x, %y
2886  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2887  ret void
2888}
2889
2890define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2891; CHECK-LABEL: @sdiv_v4i16(
2892; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2893; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2894; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2895; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2896; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2897; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2898; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2899; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2900; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2901; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2902; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2903; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2904; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2905; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2906; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2907; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2908; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2909; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2910; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2911; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2912; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
2913; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
2914; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
2915; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0
2916; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1
2917; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2918; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
2919; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
2920; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
2921; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
2922; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
2923; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
2924; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
2925; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
2926; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
2927; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
2928; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
2929; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
2930; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
2931; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
2932; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2933; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
2934; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
2935; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
2936; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
2937; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
2938; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
2939; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1
2940; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2
2941; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2942; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
2943; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
2944; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
2945; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
2946; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
2947; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
2948; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
2949; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
2950; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
2951; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
2952; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
2953; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
2954; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
2955; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
2956; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
2957; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
2958; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
2959; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
2960; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
2961; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
2962; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
2963; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2
2964; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3
2965; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2966; CHECK-NEXT:    [[TMP75:%.*]] = sext i16 [[TMP73]] to i32
2967; CHECK-NEXT:    [[TMP76:%.*]] = sext i16 [[TMP74]] to i32
2968; CHECK-NEXT:    [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]]
2969; CHECK-NEXT:    [[TMP78:%.*]] = ashr i32 [[TMP77]], 30
2970; CHECK-NEXT:    [[TMP79:%.*]] = or i32 [[TMP78]], 1
2971; CHECK-NEXT:    [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float
2972; CHECK-NEXT:    [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float
2973; CHECK-NEXT:    [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]])
2974; CHECK-NEXT:    [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]]
2975; CHECK-NEXT:    [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]])
2976; CHECK-NEXT:    [[TMP85:%.*]] = fneg fast float [[TMP84]]
2977; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]])
2978; CHECK-NEXT:    [[TMP87:%.*]] = fptosi float [[TMP84]] to i32
2979; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]])
2980; CHECK-NEXT:    [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]])
2981; CHECK-NEXT:    [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]]
2982; CHECK-NEXT:    [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0
2983; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]]
2984; CHECK-NEXT:    [[TMP93:%.*]] = shl i32 [[TMP92]], 16
2985; CHECK-NEXT:    [[TMP94:%.*]] = ashr i32 [[TMP93]], 16
2986; CHECK-NEXT:    [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16
2987; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3
2988; CHECK-NEXT:    store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2989; CHECK-NEXT:    ret void
2990;
2991; GFX6-LABEL: sdiv_v4i16:
2992; GFX6:       ; %bb.0:
2993; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
2994; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2995; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2996; GFX6-NEXT:    s_mov_b32 s2, -1
2997; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2998; GFX6-NEXT:    s_sext_i32_i16 s8, s6
2999; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
3000; GFX6-NEXT:    s_sext_i32_i16 s9, s4
3001; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
3002; GFX6-NEXT:    s_xor_b32 s8, s9, s8
3003; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3004; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
3005; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
3006; GFX6-NEXT:    s_or_b32 s8, s8, 1
3007; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3008; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3009; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3010; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3011; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3012; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
3013; GFX6-NEXT:    v_mov_b32_e32 v3, s8
3014; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3015; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
3016; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
3017; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
3018; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
3019; GFX6-NEXT:    s_xor_b32 s4, s4, s6
3020; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3021; GFX6-NEXT:    s_or_b32 s4, s4, 1
3022; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
3023; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
3024; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
3025; GFX6-NEXT:    v_mov_b32_e32 v4, s4
3026; GFX6-NEXT:    s_sext_i32_i16 s4, s7
3027; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
3028; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
3029; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
3030; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
3031; GFX6-NEXT:    s_sext_i32_i16 s6, s5
3032; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v1, v3
3033; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
3034; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3035; GFX6-NEXT:    s_xor_b32 s4, s6, s4
3036; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3037; GFX6-NEXT:    s_or_b32 s4, s4, 1
3038; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
3039; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
3040; GFX6-NEXT:    v_mad_f32 v1, -v4, v2, v1
3041; GFX6-NEXT:    v_mov_b32_e32 v5, s4
3042; GFX6-NEXT:    s_ashr_i32 s4, s7, 16
3043; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
3044; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
3045; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
3046; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
3047; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
3048; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
3049; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s5
3050; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
3051; GFX6-NEXT:    s_xor_b32 s4, s5, s4
3052; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3053; GFX6-NEXT:    s_or_b32 s4, s4, 1
3054; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
3055; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
3056; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
3057; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
3058; GFX6-NEXT:    v_mov_b32_e32 v6, s4
3059; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
3060; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
3061; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
3062; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3063; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3064; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3065; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
3066; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3067; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
3068; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3069; GFX6-NEXT:    s_endpgm
3070;
3071; GFX9-LABEL: sdiv_v4i16:
3072; GFX9:       ; %bb.0:
3073; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3074; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3075; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3076; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3077; GFX9-NEXT:    s_sext_i32_i16 s0, s6
3078; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
3079; GFX9-NEXT:    s_sext_i32_i16 s1, s4
3080; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s1
3081; GFX9-NEXT:    s_xor_b32 s0, s1, s0
3082; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3083; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3084; GFX9-NEXT:    s_or_b32 s8, s0, 1
3085; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
3086; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3087; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
3088; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3089; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3090; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
3091; GFX9-NEXT:    s_ashr_i32 s1, s6, 16
3092; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
3093; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
3094; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s4
3095; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3096; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3097; GFX9-NEXT:    v_add_u32_e32 v3, s0, v3
3098; GFX9-NEXT:    v_mul_f32_e32 v4, v1, v4
3099; GFX9-NEXT:    s_xor_b32 s0, s4, s1
3100; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
3101; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3102; GFX9-NEXT:    v_mad_f32 v1, -v4, v0, v1
3103; GFX9-NEXT:    s_or_b32 s4, s0, 1
3104; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3105; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3106; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
3107; GFX9-NEXT:    s_sext_i32_i16 s1, s7
3108; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
3109; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3110; GFX9-NEXT:    v_add_u32_e32 v4, s0, v4
3111; GFX9-NEXT:    s_sext_i32_i16 s0, s5
3112; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s0
3113; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
3114; GFX9-NEXT:    s_xor_b32 s0, s0, s1
3115; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3116; GFX9-NEXT:    s_or_b32 s4, s0, 1
3117; GFX9-NEXT:    v_mul_f32_e32 v5, v1, v5
3118; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
3119; GFX9-NEXT:    v_mad_f32 v1, -v5, v0, v1
3120; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3121; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3122; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
3123; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3124; GFX9-NEXT:    s_ashr_i32 s1, s7, 16
3125; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
3126; GFX9-NEXT:    v_add_u32_e32 v1, s0, v5
3127; GFX9-NEXT:    s_ashr_i32 s0, s5, 16
3128; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
3129; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v0
3130; GFX9-NEXT:    s_xor_b32 s0, s0, s1
3131; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3132; GFX9-NEXT:    s_or_b32 s4, s0, 1
3133; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
3134; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
3135; GFX9-NEXT:    v_mad_f32 v5, -v6, v0, v5
3136; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
3137; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v0|
3138; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3139; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3140; GFX9-NEXT:    v_add_u32_e32 v0, s0, v6
3141; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3142; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
3143; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
3144; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
3145; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
3146; GFX9-NEXT:    s_endpgm
3147  %r = sdiv <4 x i16> %x, %y
3148  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
3149  ret void
3150}
3151
; srem_v4i16 - signed remainder of two <4 x i16> kernel arguments, with the
; result stored to %out.
;
; The "CHECK" lines cover the opt run with -amdgpu-codegenprepare. They show
; the vector srem handled one lane at a time (extractelement indices 0..3).
; For each lane the operands are sign-extended to i32 and converted to float,
; an approximate quotient is formed with llvm.amdgcn.rcp.f32, fmul and
; llvm.trunc.f32, and the quotient is adjusted by either
; ((x ^ y) ashr 30) | 1 or 0, selected by comparing the magnitude of the
; llvm.amdgcn.fmad.ftz.f32 residual against the magnitude of the divisor.
; The remainder is then x - q * y, sign-extended again from its low 16 bits
; (shl 16 / ashr 16), truncated to i16 and inserted into the result vector.
;
; The "GFX6" and "GFX9" lines hold the llc output for -mcpu=tahiti and
; -mcpu=gfx900 respectively (see the RUN lines at the top of the file).
;
; Every check line below is autogenerated by utils/update_llc_test_checks.py
; (see the NOTE on the first line of the file), so regenerate rather than
; hand-edit when the expected output changes.
3152define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
3153; CHECK-LABEL: @srem_v4i16(
3154; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
3155; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
3156; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
3157; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
3158; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3159; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3160; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3161; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3162; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3163; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3164; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3165; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3166; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3167; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3168; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3169; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3170; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3171; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3172; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3173; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3174; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
3175; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
3176; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
3177; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
3178; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
3179; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0
3180; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1
3181; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1
3182; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
3183; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
3184; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
3185; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
3186; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
3187; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
3188; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
3189; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3190; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
3191; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
3192; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
3193; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
3194; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
3195; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
3196; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
3197; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
3198; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
3199; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
3200; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
3201; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
3202; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
3203; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
3204; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
3205; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1
3206; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2
3207; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2
3208; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
3209; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
3210; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
3211; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
3212; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
3213; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
3214; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
3215; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
3216; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
3217; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
3218; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
3219; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
3220; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
3221; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
3222; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
3223; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
3224; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
3225; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
3226; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
3227; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
3228; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
3229; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
3230; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
3231; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2
3232; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3
3233; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3
3234; CHECK-NEXT:    [[TMP81:%.*]] = sext i16 [[TMP79]] to i32
3235; CHECK-NEXT:    [[TMP82:%.*]] = sext i16 [[TMP80]] to i32
3236; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]]
3237; CHECK-NEXT:    [[TMP84:%.*]] = ashr i32 [[TMP83]], 30
3238; CHECK-NEXT:    [[TMP85:%.*]] = or i32 [[TMP84]], 1
3239; CHECK-NEXT:    [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float
3240; CHECK-NEXT:    [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float
3241; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]])
3242; CHECK-NEXT:    [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]]
3243; CHECK-NEXT:    [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]])
3244; CHECK-NEXT:    [[TMP91:%.*]] = fneg fast float [[TMP90]]
3245; CHECK-NEXT:    [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]])
3246; CHECK-NEXT:    [[TMP93:%.*]] = fptosi float [[TMP90]] to i32
3247; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]])
3248; CHECK-NEXT:    [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]])
3249; CHECK-NEXT:    [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]]
3250; CHECK-NEXT:    [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0
3251; CHECK-NEXT:    [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]]
3252; CHECK-NEXT:    [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]]
3253; CHECK-NEXT:    [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]]
3254; CHECK-NEXT:    [[TMP101:%.*]] = shl i32 [[TMP100]], 16
3255; CHECK-NEXT:    [[TMP102:%.*]] = ashr i32 [[TMP101]], 16
3256; CHECK-NEXT:    [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16
3257; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3
3258; CHECK-NEXT:    store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
3259; CHECK-NEXT:    ret void
3260;
3261; GFX6-LABEL: srem_v4i16:
3262; GFX6:       ; %bb.0:
3263; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
3264; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3265; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3266; GFX6-NEXT:    s_mov_b32 s2, -1
3267; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3268; GFX6-NEXT:    s_sext_i32_i16 s8, s6
3269; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
3270; GFX6-NEXT:    s_sext_i32_i16 s9, s4
3271; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
3272; GFX6-NEXT:    s_xor_b32 s8, s9, s8
3273; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3274; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
3275; GFX6-NEXT:    s_or_b32 s8, s8, 1
3276; GFX6-NEXT:    v_mov_b32_e32 v3, s8
3277; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3278; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3279; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3280; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3281; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3282; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3283; GFX6-NEXT:    v_mov_b32_e32 v1, s4
3284; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
3285; GFX6-NEXT:    v_mov_b32_e32 v2, s6
3286; GFX6-NEXT:    v_alignbit_b32 v2, s7, v2, 16
3287; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
3288; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v3
3289; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 16
3290; GFX6-NEXT:    v_bfe_i32 v5, v1, 0, 16
3291; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v5
3292; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
3293; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
3294; GFX6-NEXT:    v_xor_b32_e32 v3, v5, v3
3295; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
3296; GFX6-NEXT:    v_mul_f32_e32 v5, v6, v7
3297; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
3298; GFX6-NEXT:    v_mad_f32 v6, -v5, v4, v6
3299; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
3300; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
3301; GFX6-NEXT:    v_or_b32_e32 v3, 1, v3
3302; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
3303; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
3304; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
3305; GFX6-NEXT:    s_sext_i32_i16 s4, s7
3306; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
3307; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s4
3308; GFX6-NEXT:    s_sext_i32_i16 s6, s5
3309; GFX6-NEXT:    s_xor_b32 s4, s6, s4
3310; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
3311; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s6
3312; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v3
3313; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3314; GFX6-NEXT:    s_or_b32 s4, s4, 1
3315; GFX6-NEXT:    v_mov_b32_e32 v5, s4
3316; GFX6-NEXT:    v_mul_f32_e32 v4, v2, v4
3317; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
3318; GFX6-NEXT:    v_mad_f32 v2, -v4, v3, v2
3319; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
3320; GFX6-NEXT:    s_ashr_i32 s4, s7, 16
3321; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v3|
3322; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s4
3323; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
3324; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
3325; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s7
3326; GFX6-NEXT:    s_lshr_b32 s6, s7, 16
3327; GFX6-NEXT:    s_ashr_i32 s7, s5, 16
3328; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s7
3329; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3330; GFX6-NEXT:    s_xor_b32 s4, s7, s4
3331; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3332; GFX6-NEXT:    s_or_b32 s4, s4, 1
3333; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
3334; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
3335; GFX6-NEXT:    v_mad_f32 v4, -v5, v3, v4
3336; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
3337; GFX6-NEXT:    v_mov_b32_e32 v6, s4
3338; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
3339; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
3340; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
3341; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s6
3342; GFX6-NEXT:    s_lshr_b32 s4, s5, 16
3343; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
3344; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
3345; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3346; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3347; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3348; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
3349; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3350; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
3351; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3352; GFX6-NEXT:    s_endpgm
3353;
3354; GFX9-LABEL: srem_v4i16:
3355; GFX9:       ; %bb.0:
3356; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3357; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3358; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3359; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3360; GFX9-NEXT:    s_sext_i32_i16 s8, s6
3361; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s8
3362; GFX9-NEXT:    s_sext_i32_i16 s9, s4
3363; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s9
3364; GFX9-NEXT:    s_xor_b32 s0, s9, s8
3365; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3366; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3367; GFX9-NEXT:    s_or_b32 s10, s0, 1
3368; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
3369; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3370; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
3371; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3372; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3373; GFX9-NEXT:    s_cselect_b32 s0, s10, 0
3374; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
3375; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3376; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s6
3377; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
3378; GFX9-NEXT:    v_add_u32_e32 v1, s0, v3
3379; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s4
3380; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3381; GFX9-NEXT:    s_xor_b32 s0, s4, s6
3382; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3383; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
3384; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
3385; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
3386; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
3387; GFX9-NEXT:    s_or_b32 s8, s0, 1
3388; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
3389; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
3390; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3391; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
3392; GFX9-NEXT:    s_sext_i32_i16 s8, s7
3393; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s8
3394; GFX9-NEXT:    v_add_u32_e32 v0, s0, v4
3395; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s6
3396; GFX9-NEXT:    s_sext_i32_i16 s6, s5
3397; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s6
3398; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3399; GFX9-NEXT:    s_xor_b32 s0, s6, s8
3400; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3401; GFX9-NEXT:    s_or_b32 s10, s0, 1
3402; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
3403; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
3404; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
3405; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
3406; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3407; GFX9-NEXT:    s_cselect_b32 s0, s10, 0
3408; GFX9-NEXT:    s_ashr_i32 s7, s7, 16
3409; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
3410; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s7
3411; GFX9-NEXT:    s_ashr_i32 s5, s5, 16
3412; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
3413; GFX9-NEXT:    v_add_u32_e32 v3, s0, v5
3414; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s5
3415; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3416; GFX9-NEXT:    s_xor_b32 s0, s5, s7
3417; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3418; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s8
3419; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
3420; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
3421; GFX9-NEXT:    v_mad_f32 v5, -v6, v4, v5
3422; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
3423; GFX9-NEXT:    s_or_b32 s8, s0, 1
3424; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
3425; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3426; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
3427; GFX9-NEXT:    v_add_u32_e32 v4, s0, v6
3428; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s7
3429; GFX9-NEXT:    v_sub_u32_e32 v5, s9, v1
3430; GFX9-NEXT:    v_sub_u32_e32 v1, s6, v3
3431; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3432; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v4
3433; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
3434; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v5
3435; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
3436; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
3437; GFX9-NEXT:    s_endpgm
3438  %r = srem <4 x i16> %x, %y
3439  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
3440  ret void
3441}
3442
; udiv_i3 - unsigned division of two i3 kernel arguments, with the result
; stored to %out.
;
; The "CHECK" lines cover the opt run with -amdgpu-codegenprepare. They show
; both operands zero-extended to i32 and converted to float, an approximate
; quotient formed with llvm.amdgcn.rcp.f32, fmul and llvm.trunc.f32, and the
; quotient incremented by 1 when the magnitude of the
; llvm.amdgcn.fmad.ftz.f32 residual is at least the magnitude of the divisor.
; The result is masked with 7 and truncated back to i3 before the store.
;
; The "GFX6" and "GFX9" lines hold the llc output for -mcpu=tahiti and
; -mcpu=gfx900 respectively (see the RUN lines at the top of the file).
;
; Every check line below is autogenerated by utils/update_llc_test_checks.py
; (see the NOTE on the first line of the file), so regenerate rather than
; hand-edit when the expected output changes.
3443define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3444; CHECK-LABEL: @udiv_i3(
3445; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
3446; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
3447; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
3448; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
3449; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
3450; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
3451; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
3452; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
3453; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
3454; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
3455; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3456; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
3457; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
3458; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
3459; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
3460; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 7
3461; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3
3462; CHECK-NEXT:    store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1
3463; CHECK-NEXT:    ret void
3464;
3465; GFX6-LABEL: udiv_i3:
3466; GFX6:       ; %bb.0:
3467; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
3468; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3469; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3470; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3471; GFX6-NEXT:    s_bfe_u32 s2, s4, 0x30008
3472; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
3473; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3474; GFX6-NEXT:    s_and_b32 s4, s4, 7
3475; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
3476; GFX6-NEXT:    s_mov_b32 s2, -1
3477; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
3478; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3479; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
3480; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
3481; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3482; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
3483; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3484; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
3485; GFX6-NEXT:    s_endpgm
3486;
3487; GFX9-LABEL: udiv_i3:
3488; GFX9:       ; %bb.0:
3489; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
3490; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3491; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3492; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3493; GFX9-NEXT:    s_bfe_u32 s0, s4, 0x30008
3494; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, s0
3495; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3496; GFX9-NEXT:    s_and_b32 s0, s4, 7
3497; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, s0
3498; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
3499; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
3500; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v1
3501; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v3
3502; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3503; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
3504; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3505; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
3506; GFX9-NEXT:    s_endpgm
3507  %r = udiv i3 %x, %y
3508  store i3 %r, i3 addrspace(1)* %out
3509  ret void
3510}
3511
; urem_i3 - unsigned remainder of two i3 kernel arguments, with the result
; stored to %out.
;
; The "CHECK" lines cover the opt run with -amdgpu-codegenprepare. They show
; both operands zero-extended to i32 and converted to float, an approximate
; quotient formed with llvm.amdgcn.rcp.f32, fmul and llvm.trunc.f32, and the
; quotient incremented by 1 when the magnitude of the
; llvm.amdgcn.fmad.ftz.f32 residual is at least the magnitude of the divisor.
; The remainder is then x - q * y, masked with 7 and truncated back to i3
; before the store.
;
; The "GFX6" and "GFX9" lines hold the llc output for -mcpu=tahiti and
; -mcpu=gfx900 respectively (see the RUN lines at the top of the file).
;
; Every check line below is autogenerated by utils/update_llc_test_checks.py
; (see the NOTE on the first line of the file), so regenerate rather than
; hand-edit when the expected output changes.
3512define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3513; CHECK-LABEL: @urem_i3(
3514; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
3515; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
3516; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
3517; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
3518; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
3519; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
3520; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
3521; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
3522; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
3523; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
3524; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3525; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
3526; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
3527; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
3528; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
3529; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
3530; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
3531; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 7
3532; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3
3533; CHECK-NEXT:    store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1
3534; CHECK-NEXT:    ret void
3535;
3536; GFX6-LABEL: urem_i3:
3537; GFX6:       ; %bb.0:
3538; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
3539; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3540; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3541; GFX6-NEXT:    s_bfe_u32 s2, s4, 0x30008
3542; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
3543; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3544; GFX6-NEXT:    s_and_b32 s3, s4, 7
3545; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s3
3546; GFX6-NEXT:    s_lshr_b32 s2, s4, 8
3547; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
3548; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3549; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
3550; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
3551; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3552; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3553; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
3554; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
3555; GFX6-NEXT:    s_mov_b32 s2, -1
3556; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
3557; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3558; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
3559; GFX6-NEXT:    s_endpgm
3560;
3561; GFX9-LABEL: urem_i3:
3562; GFX9:       ; %bb.0:
3563; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
3564; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3565; GFX9-NEXT:    s_bfe_u32 s3, s2, 0x30008
3566; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
3567; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3568; GFX9-NEXT:    s_and_b32 s4, s2, 7
3569; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
3570; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
3571; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
3572; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
3573; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v1
3574; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v2
3575; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3576; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3577; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
3578; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
3579; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3580; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
3581; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3582; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3583; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
3584; GFX9-NEXT:    s_endpgm
3585  %r = urem i3 %x, %y
3586  store i3 %r, i3 addrspace(1)* %out
3587  ret void
3588}
3589
; sdiv_i3 - signed division of two i3 kernel arguments; the quotient is stored
; to %out.
;
; The CHECK-prefixed lines pin the IR from the opt RUN line
; (-amdgpu-codegenprepare): both operands are sign-extended to i32, the
; quotient is estimated in f32 (llvm.amdgcn.rcp.f32, fmul, llvm.trunc.f32),
; the remainder is formed with llvm.amdgcn.fmad.ftz.f32, and when
; |remainder| >= |y| the quotient is adjusted by ((x ^ y) >> 30) | 1.
; The result is then sign-extended from 3 bits (shl/ashr by 29) and truncated
; back to i3.
; The GFX6-/GFX9-prefixed lines pin the llc output for tahiti and gfx900.
; All assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with that script instead of editing by hand.
3590define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3591; CHECK-LABEL: @sdiv_i3(
3592; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
3593; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
3594; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
3595; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
3596; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
3597; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
3598; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
3599; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
3600; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
3601; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
3602; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
3603; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
3604; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
3605; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
3606; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
3607; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
3608; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
3609; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
3610; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 29
3611; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 29
3612; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3
3613; CHECK-NEXT:    store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1
3614; CHECK-NEXT:    ret void
3615;
3616; GFX6-LABEL: sdiv_i3:
3617; GFX6:       ; %bb.0:
3618; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
3619; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3620; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3621; GFX6-NEXT:    s_mov_b32 s2, -1
3622; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3623; GFX6-NEXT:    s_bfe_i32 s5, s4, 0x30008
3624; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
3625; GFX6-NEXT:    s_bfe_i32 s4, s4, 0x30000
3626; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
3627; GFX6-NEXT:    s_xor_b32 s4, s4, s5
3628; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3629; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3630; GFX6-NEXT:    s_or_b32 s4, s4, 1
3631; GFX6-NEXT:    v_mov_b32_e32 v3, s4
3632; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3633; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3634; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3635; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3636; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3637; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3638; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
3639; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3640; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
3641; GFX6-NEXT:    s_endpgm
3642;
3643; GFX9-LABEL: sdiv_i3:
3644; GFX9:       ; %bb.0:
3645; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
3646; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3647; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3648; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3649; GFX9-NEXT:    s_bfe_i32 s0, s4, 0x30008
3650; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
3651; GFX9-NEXT:    s_bfe_i32 s1, s4, 0x30000
3652; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
3653; GFX9-NEXT:    s_xor_b32 s0, s1, s0
3654; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3655; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3656; GFX9-NEXT:    s_or_b32 s4, s0, 1
3657; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
3658; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3659; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
3660; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3661; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
3662; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3663; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3664; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
3665; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3666; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
3667; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i3 sdiv whose result is stored to %out.
3668  %r = sdiv i3 %x, %y
3669  store i3 %r, i3 addrspace(1)* %out
3670  ret void
3671}
3672
; srem_i3 - signed remainder of two i3 kernel arguments; the result is stored
; to %out.
;
; The CHECK-prefixed lines pin the IR from the opt RUN line
; (-amdgpu-codegenprepare): the quotient is computed the same way as in the
; expected sdiv expansion (sext to i32, f32 reciprocal estimate, trunc,
; correction by ((x ^ y) >> 30) | 1 when |remainder| >= |y|). The remainder is
; then x - quotient * y, sign-extended from 3 bits (shl/ashr by 29) and
; truncated back to i3.
; The GFX6-/GFX9-prefixed lines pin the llc output for tahiti and gfx900.
; All assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with that script instead of editing by hand.
3673define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3674; CHECK-LABEL: @srem_i3(
3675; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
3676; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
3677; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
3678; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
3679; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
3680; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
3681; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
3682; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
3683; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
3684; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
3685; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
3686; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
3687; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
3688; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
3689; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
3690; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
3691; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
3692; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
3693; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
3694; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
3695; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 29
3696; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 29
3697; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3
3698; CHECK-NEXT:    store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1
3699; CHECK-NEXT:    ret void
3700;
3701; GFX6-LABEL: srem_i3:
3702; GFX6:       ; %bb.0:
3703; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
3704; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3705; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3706; GFX6-NEXT:    s_bfe_i32 s2, s4, 0x30008
3707; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
3708; GFX6-NEXT:    s_bfe_i32 s5, s4, 0x30000
3709; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
3710; GFX6-NEXT:    s_xor_b32 s2, s5, s2
3711; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3712; GFX6-NEXT:    s_ashr_i32 s2, s2, 30
3713; GFX6-NEXT:    s_or_b32 s2, s2, 1
3714; GFX6-NEXT:    v_mov_b32_e32 v3, s2
3715; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3716; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3717; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3718; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3719; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3720; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3721; GFX6-NEXT:    s_lshr_b32 s3, s4, 8
3722; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
3723; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s3
3724; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3725; GFX6-NEXT:    s_mov_b32 s2, -1
3726; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
3727; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3728; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
3729; GFX6-NEXT:    s_endpgm
3730;
3731; GFX9-LABEL: srem_i3:
3732; GFX9:       ; %bb.0:
3733; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
3734; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3735; GFX9-NEXT:    s_bfe_i32 s2, s4, 0x30008
3736; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
3737; GFX9-NEXT:    s_bfe_i32 s3, s4, 0x30000
3738; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s3
3739; GFX9-NEXT:    s_xor_b32 s2, s3, s2
3740; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3741; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
3742; GFX9-NEXT:    s_lshr_b32 s5, s4, 8
3743; GFX9-NEXT:    s_or_b32 s6, s2, 1
3744; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
3745; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
3746; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
3747; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
3748; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
3749; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
3750; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
3751; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
3752; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
3753; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3754; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3755; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
3756; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3757; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3758; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
3759; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i3 srem whose result is stored to %out.
3760  %r = srem i3 %x, %y
3761  store i3 %r, i3 addrspace(1)* %out
3762  ret void
3763}
3764
; udiv_v3i16 - unsigned division of two <3 x i16> kernel arguments; the
; quotient vector is stored to %out.
;
; The CHECK-prefixed lines pin the IR from the opt RUN line
; (-amdgpu-codegenprepare): the vector operation is scalarised element by
; element (extractelement / insertelement for indices 0, 1 and 2). Each lane is
; zero-extended to i32, the quotient is estimated in f32
; (llvm.amdgcn.rcp.f32, fmul, llvm.trunc.f32), incremented by 1 when
; |remainder| >= |y|, masked with 65535 and truncated back to i16.
; The GFX6-/GFX9-prefixed lines pin the llc output for tahiti and gfx900; in
; both, the result is written as one dword store plus one short store at
; offset 4.
; All assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with that script instead of editing by hand.
3765define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
3766; CHECK-LABEL: @udiv_v3i16(
3767; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3768; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3769; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
3770; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
3771; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3772; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3773; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3774; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3775; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3776; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3777; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3778; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3779; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3780; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3781; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3782; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3783; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3784; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
3785; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
3786; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0
3787; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1
3788; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3789; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
3790; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
3791; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
3792; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
3793; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
3794; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
3795; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
3796; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
3797; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
3798; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
3799; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
3800; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
3801; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
3802; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
3803; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
3804; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
3805; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
3806; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1
3807; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2
3808; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3809; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
3810; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
3811; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
3812; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
3813; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
3814; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
3815; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
3816; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
3817; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
3818; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
3819; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
3820; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
3821; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
3822; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
3823; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
3824; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
3825; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
3826; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2
3827; CHECK-NEXT:    store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
3828; CHECK-NEXT:    ret void
3829;
3830; GFX6-LABEL: udiv_v3i16:
3831; GFX6:       ; %bb.0:
3832; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
3833; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3834; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3835; GFX6-NEXT:    s_mov_b32 s2, -1
3836; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3837; GFX6-NEXT:    s_and_b32 s9, s6, 0xffff
3838; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
3839; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
3840; GFX6-NEXT:    s_and_b32 s8, s4, 0xffff
3841; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
3842; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
3843; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3844; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
3845; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
3846; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
3847; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
3848; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
3849; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
3850; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3851; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
3852; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3853; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
3854; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
3855; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
3856; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
3857; GFX6-NEXT:    s_and_b32 s4, s5, 0xffff
3858; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
3859; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
3860; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3861; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
3862; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
3863; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3864; GFX6-NEXT:    v_mul_f32_e32 v2, v5, v6
3865; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3866; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
3867; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3868; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v5
3869; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
3870; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
3871; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3872; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3873; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
3874; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3875; GFX6-NEXT:    s_endpgm
3876;
3877; GFX9-LABEL: udiv_v3i16:
3878; GFX9:       ; %bb.0:
3879; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3880; GFX9-NEXT:    v_mov_b32_e32 v6, 0
3881; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3882; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3883; GFX9-NEXT:    s_and_b32 s3, s6, 0xffff
3884; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
3885; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
3886; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
3887; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
3888; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
3889; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3890; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
3891; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
3892; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
3893; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
3894; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
3895; GFX9-NEXT:    s_and_b32 s2, s7, 0xffff
3896; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
3897; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
3898; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s2
3899; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
3900; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
3901; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
3902; GFX9-NEXT:    s_and_b32 s2, s5, 0xffff
3903; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
3904; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
3905; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
3906; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
3907; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
3908; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
3909; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
3910; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v7
3911; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
3912; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
3913; GFX9-NEXT:    v_mad_f32 v2, -v2, v4, v5
3914; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
3915; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3916; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
3917; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
3918; GFX9-NEXT:    global_store_short v6, v2, s[0:1] offset:4
3919; GFX9-NEXT:    global_store_dword v6, v0, s[0:1]
3920; GFX9-NEXT:    s_endpgm
; Input IR under test: a single <3 x i16> udiv whose result is stored to %out.
3921  %r = udiv <3 x i16> %x, %y
3922  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
3923  ret void
3924}
3925
; urem_v3i16 - unsigned remainder of two <3 x i16> kernel arguments; the
; result vector is stored to %out.
;
; The CHECK-prefixed lines pin the IR from the opt RUN line
; (-amdgpu-codegenprepare): the vector operation is scalarised element by
; element (extractelement / insertelement for indices 0, 1 and 2). Each lane is
; zero-extended to i32 and its quotient is estimated in f32
; (llvm.amdgcn.rcp.f32, fmul, llvm.trunc.f32) with a +1 correction when
; |remainder| >= |y|. The lane result is x - quotient * y, masked with 65535
; and truncated back to i16.
; The GFX6-/GFX9-prefixed lines pin the llc output for tahiti and gfx900; in
; both, the result is written as one dword store plus one short store at
; offset 4.
; All assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with that script instead of editing by hand.
3926define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
3927; CHECK-LABEL: @urem_v3i16(
3928; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3929; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3930; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
3931; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
3932; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3933; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3934; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3935; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3936; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3937; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3938; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3939; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3940; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3941; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3942; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3943; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3944; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3945; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
3946; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
3947; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
3948; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
3949; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0
3950; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1
3951; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3952; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
3953; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
3954; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
3955; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
3956; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
3957; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
3958; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
3959; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
3960; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
3961; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
3962; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3963; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
3964; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
3965; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
3966; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
3967; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
3968; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
3969; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
3970; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
3971; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1
3972; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2
3973; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3974; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
3975; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
3976; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
3977; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
3978; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
3979; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
3980; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
3981; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
3982; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
3983; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
3984; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
3985; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
3986; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
3987; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
3988; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
3989; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
3990; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
3991; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
3992; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
3993; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2
3994; CHECK-NEXT:    store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
3995; CHECK-NEXT:    ret void
3996;
3997; GFX6-LABEL: urem_v3i16:
3998; GFX6:       ; %bb.0:
3999; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
4000; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4001; GFX6-NEXT:    s_mov_b32 s3, 0xf000
4002; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4003; GFX6-NEXT:    s_and_b32 s8, s6, 0xffff
4004; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
4005; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
4006; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s2
4007; GFX6-NEXT:    v_mov_b32_e32 v2, s6
4008; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
4009; GFX6-NEXT:    v_alignbit_b32 v2, s7, v2, 16
4010; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v2
4011; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, v5
4012; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4013; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4014; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v4
4015; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
4016; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
4017; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4018; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v6, vcc
4019; GFX6-NEXT:    v_alignbit_b32 v0, s5, v0, 16
4020; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s6
4021; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v0
4022; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
4023; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
4024; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
4025; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
4026; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
4027; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4028; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4029; GFX6-NEXT:    v_mad_f32 v3, -v4, v5, v3
4030; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
4031; GFX6-NEXT:    s_and_b32 s4, s5, 0xffff
4032; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s4
4033; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
4034; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
4035; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
4036; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
4037; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
4038; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4039; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
4040; GFX6-NEXT:    v_mad_f32 v3, -v3, v6, v7
4041; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v6
4042; GFX6-NEXT:    s_mov_b32 s2, -1
4043; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
4044; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s7
4045; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
4046; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
4047; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v3
4048; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4049; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
4050; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
4051; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4052; GFX6-NEXT:    s_endpgm
4053;
4054; GFX9-LABEL: urem_v3i16:
4055; GFX9:       ; %bb.0:
4056; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
4057; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4058; GFX9-NEXT:    s_and_b32 s3, s6, 0xffff
4059; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
4060; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
4061; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
4062; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
4063; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
4064; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
4065; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
4066; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
4067; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
4068; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4069; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
4070; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v4
4071; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
4072; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
4073; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
4074; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
4075; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4076; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
4077; GFX9-NEXT:    s_and_b32 s3, s7, 0xffff
4078; GFX9-NEXT:    v_mad_f32 v2, -v5, v1, v3
4079; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s3
4080; GFX9-NEXT:    s_and_b32 s5, s5, 0xffff
4081; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v5
4082; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s5
4083; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
4084; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v1
4085; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
4086; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v6
4087; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
4088; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
4089; GFX9-NEXT:    v_mad_f32 v2, -v2, v3, v5
4090; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
4091; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4092; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
4093; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
4094; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s3
4095; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
4096; GFX9-NEXT:    v_mov_b32_e32 v3, 0
4097; GFX9-NEXT:    v_sub_u32_e32 v1, s4, v1
4098; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
4099; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4100; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
4101; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4102; GFX9-NEXT:    global_store_short v3, v2, s[0:1] offset:4
4103; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
4104; GFX9-NEXT:    s_endpgm
; Input IR under test: a single <3 x i16> urem whose result is stored to %out.
4105  %r = urem <3 x i16> %x, %y
4106  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
4107  ret void
4108}
4109
4110define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
4111; CHECK-LABEL: @sdiv_v3i16(
4112; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
4113; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
4114; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
4115; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
4116; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4117; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4118; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4119; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4120; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4121; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4122; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4123; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4124; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4125; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4126; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4127; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4128; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4129; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4130; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4131; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4132; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
4133; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
4134; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
4135; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0
4136; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1
4137; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1
4138; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
4139; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
4140; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
4141; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
4142; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
4143; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
4144; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
4145; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
4146; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
4147; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
4148; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
4149; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
4150; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
4151; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
4152; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4153; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
4154; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
4155; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
4156; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
4157; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
4158; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
4159; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1
4160; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2
4161; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2
4162; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
4163; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
4164; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
4165; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
4166; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
4167; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
4168; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
4169; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
4170; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
4171; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
4172; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
4173; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
4174; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
4175; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
4176; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
4177; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
4178; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
4179; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
4180; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
4181; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
4182; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
4183; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2
4184; CHECK-NEXT:    store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
4185; CHECK-NEXT:    ret void
4186;
4187; GFX6-LABEL: sdiv_v3i16:
4188; GFX6:       ; %bb.0:
4189; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
4190; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4191; GFX6-NEXT:    s_mov_b32 s3, 0xf000
4192; GFX6-NEXT:    s_mov_b32 s2, -1
4193; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4194; GFX6-NEXT:    s_sext_i32_i16 s8, s6
4195; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
4196; GFX6-NEXT:    s_sext_i32_i16 s9, s4
4197; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
4198; GFX6-NEXT:    s_xor_b32 s8, s9, s8
4199; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4200; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
4201; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
4202; GFX6-NEXT:    s_or_b32 s8, s8, 1
4203; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
4204; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4205; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
4206; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
4207; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
4208; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
4209; GFX6-NEXT:    v_mov_b32_e32 v3, s8
4210; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
4211; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
4212; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
4213; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
4214; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
4215; GFX6-NEXT:    s_xor_b32 s4, s4, s6
4216; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
4217; GFX6-NEXT:    s_or_b32 s4, s4, 1
4218; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
4219; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4220; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
4221; GFX6-NEXT:    v_mov_b32_e32 v4, s4
4222; GFX6-NEXT:    s_sext_i32_i16 s4, s7
4223; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
4224; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
4225; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
4226; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
4227; GFX6-NEXT:    s_sext_i32_i16 s5, s5
4228; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
4229; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s5
4230; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
4231; GFX6-NEXT:    s_xor_b32 s4, s5, s4
4232; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
4233; GFX6-NEXT:    s_or_b32 s4, s4, 1
4234; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4235; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4236; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
4237; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
4238; GFX6-NEXT:    v_mov_b32_e32 v5, s4
4239; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
4240; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
4241; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
4242; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4243; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4244; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
4245; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
4246; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4247; GFX6-NEXT:    s_endpgm
4248;
4249; GFX9-LABEL: sdiv_v3i16:
4250; GFX9:       ; %bb.0:
4251; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
4252; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4253; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4254; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4255; GFX9-NEXT:    s_sext_i32_i16 s0, s6
4256; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
4257; GFX9-NEXT:    s_sext_i32_i16 s1, s4
4258; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
4259; GFX9-NEXT:    s_xor_b32 s0, s1, s0
4260; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4261; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4262; GFX9-NEXT:    s_or_b32 s8, s0, 1
4263; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
4264; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
4265; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
4266; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
4267; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4268; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
4269; GFX9-NEXT:    s_ashr_i32 s1, s6, 16
4270; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
4271; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
4272; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
4273; GFX9-NEXT:    v_add_u32_e32 v2, s0, v3
4274; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s4
4275; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
4276; GFX9-NEXT:    s_xor_b32 s0, s4, s1
4277; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4278; GFX9-NEXT:    s_or_b32 s4, s0, 1
4279; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
4280; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4281; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
4282; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
4283; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4284; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
4285; GFX9-NEXT:    s_sext_i32_i16 s1, s7
4286; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
4287; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
4288; GFX9-NEXT:    v_add_u32_e32 v3, s0, v4
4289; GFX9-NEXT:    s_sext_i32_i16 s0, s5
4290; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s0
4291; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
4292; GFX9-NEXT:    s_xor_b32 s0, s0, s1
4293; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4294; GFX9-NEXT:    s_or_b32 s4, s0, 1
4295; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4296; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4297; GFX9-NEXT:    v_mad_f32 v4, -v5, v0, v4
4298; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
4299; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
4300; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4301; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
4302; GFX9-NEXT:    v_add_u32_e32 v0, s0, v5
4303; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
4304; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
4305; GFX9-NEXT:    global_store_short v1, v0, s[2:3] offset:4
4306; GFX9-NEXT:    global_store_dword v1, v2, s[2:3]
4307; GFX9-NEXT:    s_endpgm
4308  %r = sdiv <3 x i16> %x, %y
4309  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
4310  ret void
4311}
4312
; srem_v3i16: signed remainder of two <3 x i16> kernel arguments, stored to %out.
;
; Expected IR for the opt run (the CHECK prefix): the vector srem is split into
; three scalar sequences, one per element. In each sequence the operands are
; sign-extended to i32 and converted to float, a quotient q is formed as
; trunc(x * rcp(y)), and q is adjusted by +1 or -1 (taken from the sign of
; x ^ y via xor / ashr 30 / or 1) when |fmad(-q, y, x)| >= |y|. The remainder
; is then x - q * y, sign-extended from 16 bits (shl / ashr by 16), truncated
; to i16 and inserted back into the result vector.
;
; The GFX6 and GFX9 prefixes pin the llc output for the tahiti and gfx900
; RUN lines at the top of the file. All assertion lines are autogenerated;
; regenerate them with the update script rather than editing by hand.
4313define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
4314; CHECK-LABEL: @srem_v3i16(
4315; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
4316; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
4317; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
4318; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
4319; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4320; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4321; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4322; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4323; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4324; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4325; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4326; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4327; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4328; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4329; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4330; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4331; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4332; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4333; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4334; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4335; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
4336; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
4337; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
4338; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
4339; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
4340; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0
4341; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1
4342; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1
4343; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
4344; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
4345; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
4346; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
4347; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
4348; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
4349; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
4350; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
4351; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
4352; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
4353; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
4354; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
4355; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
4356; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
4357; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
4358; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
4359; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
4360; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
4361; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
4362; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
4363; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
4364; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
4365; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
4366; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1
4367; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2
4368; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2
4369; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
4370; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
4371; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
4372; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
4373; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
4374; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
4375; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
4376; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
4377; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
4378; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
4379; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
4380; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
4381; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
4382; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
4383; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
4384; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
4385; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
4386; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
4387; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
4388; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
4389; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
4390; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
4391; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
4392; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2
4393; CHECK-NEXT:    store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
4394; CHECK-NEXT:    ret void
4395;
4396; GFX6-LABEL: srem_v3i16:
4397; GFX6:       ; %bb.0:
4398; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
4399; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4400; GFX6-NEXT:    s_mov_b32 s3, 0xf000
4401; GFX6-NEXT:    s_mov_b32 s2, -1
4402; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4403; GFX6-NEXT:    s_sext_i32_i16 s8, s6
4404; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
4405; GFX6-NEXT:    s_sext_i32_i16 s9, s4
4406; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
4407; GFX6-NEXT:    s_xor_b32 s8, s9, s8
4408; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4409; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
4410; GFX6-NEXT:    s_or_b32 s8, s8, 1
4411; GFX6-NEXT:    v_mov_b32_e32 v3, s8
4412; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
4413; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4414; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
4415; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
4416; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
4417; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
4418; GFX6-NEXT:    v_mov_b32_e32 v1, s4
4419; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
4420; GFX6-NEXT:    v_mov_b32_e32 v2, s6
4421; GFX6-NEXT:    v_alignbit_b32 v2, s7, v2, 16
4422; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
4423; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v3
4424; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 16
4425; GFX6-NEXT:    v_bfe_i32 v5, v1, 0, 16
4426; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v5
4427; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
4428; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
4429; GFX6-NEXT:    v_xor_b32_e32 v3, v5, v3
4430; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
4431; GFX6-NEXT:    v_mul_f32_e32 v5, v6, v7
4432; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
4433; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
4434; GFX6-NEXT:    v_mad_f32 v6, -v5, v4, v6
4435; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
4436; GFX6-NEXT:    s_sext_i32_i16 s4, s7
4437; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
4438; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s4
4439; GFX6-NEXT:    v_or_b32_e32 v3, 1, v3
4440; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
4441; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
4442; GFX6-NEXT:    s_sext_i32_i16 s6, s5
4443; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
4444; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s6
4445; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v4
4446; GFX6-NEXT:    s_xor_b32 s4, s6, s4
4447; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
4448; GFX6-NEXT:    s_or_b32 s4, s4, 1
4449; GFX6-NEXT:    v_mul_f32_e32 v5, v3, v5
4450; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
4451; GFX6-NEXT:    v_mad_f32 v3, -v5, v4, v3
4452; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
4453; GFX6-NEXT:    v_mov_b32_e32 v6, s4
4454; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v4|
4455; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
4456; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
4457; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s7
4458; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
4459; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4460; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v3
4461; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4462; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
4463; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
4464; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4465; GFX6-NEXT:    s_endpgm
4466;
4467; GFX9-LABEL: srem_v3i16:
4468; GFX9:       ; %bb.0:
4469; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
4470; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4471; GFX9-NEXT:    s_sext_i32_i16 s8, s6
4472; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s8
4473; GFX9-NEXT:    s_sext_i32_i16 s9, s4
4474; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s9
4475; GFX9-NEXT:    s_xor_b32 s2, s9, s8
4476; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4477; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
4478; GFX9-NEXT:    s_or_b32 s10, s2, 1
4479; GFX9-NEXT:    s_sext_i32_i16 s7, s7
4480; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
4481; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
4482; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
4483; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
4484; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
4485; GFX9-NEXT:    s_cselect_b32 s2, s10, 0
4486; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
4487; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
4488; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s6
4489; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
4490; GFX9-NEXT:    s_sext_i32_i16 s5, s5
4491; GFX9-NEXT:    v_add_u32_e32 v1, s2, v2
4492; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s4
4493; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4494; GFX9-NEXT:    s_xor_b32 s2, s4, s6
4495; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
4496; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
4497; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
4498; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
4499; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
4500; GFX9-NEXT:    s_or_b32 s8, s2, 1
4501; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
4502; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
4503; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s7
4504; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
4505; GFX9-NEXT:    s_cselect_b32 s2, s8, 0
4506; GFX9-NEXT:    v_add_u32_e32 v0, s2, v3
4507; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s5
4508; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
4509; GFX9-NEXT:    s_xor_b32 s2, s5, s7
4510; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
4511; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s6
4512; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
4513; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4514; GFX9-NEXT:    v_mad_f32 v3, -v4, v2, v3
4515; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
4516; GFX9-NEXT:    s_or_b32 s6, s2, 1
4517; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v3|, |v2|
4518; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
4519; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
4520; GFX9-NEXT:    v_add_u32_e32 v2, s2, v4
4521; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4522; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s7
4523; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
4524; GFX9-NEXT:    v_mov_b32_e32 v3, 0
4525; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
4526; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
4527; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4528; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
4529; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4530; GFX9-NEXT:    global_store_short v3, v2, s[0:1] offset:4
4531; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
4532; GFX9-NEXT:    s_endpgm
4533  %r = srem <3 x i16> %x, %y
4534  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
4535  ret void
4536}
4537
; udiv_v3i15: unsigned division of two <3 x i15> kernel arguments, stored to
; %out. Exercises a vector whose element type is not a power-of-two width.
;
; Expected IR for the opt run (the CHECK prefix): the vector udiv is split into
; three scalar sequences, one per element. In each sequence the operands are
; zero-extended to i32 and converted with uitofp, a quotient q is formed as
; fptoui(trunc(x * rcp(y))), and 1 is added when |fmad(-q, y, x)| >= |y|.
; The result is masked with 32767, truncated to i15 and inserted back into
; the result vector.
;
; In the GFX6 and GFX9 output the three 15-bit quotients are combined using
; shifts by 15 and 30 and written as one dword store plus one short store at
; offset 4. All assertion lines are autogenerated; regenerate them with the
; update script rather than editing by hand.
4538define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
4539; CHECK-LABEL: @udiv_v3i15(
4540; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4541; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4542; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
4543; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
4544; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
4545; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
4546; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
4547; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
4548; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
4549; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
4550; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
4551; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
4552; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
4553; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
4554; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
4555; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
4556; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
4557; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 32767
4558; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15
4559; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0
4560; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1
4561; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4562; CHECK-NEXT:    [[TMP23:%.*]] = zext i15 [[TMP21]] to i32
4563; CHECK-NEXT:    [[TMP24:%.*]] = zext i15 [[TMP22]] to i32
4564; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
4565; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
4566; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
4567; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
4568; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
4569; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
4570; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
4571; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
4572; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
4573; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
4574; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
4575; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
4576; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
4577; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 32767
4578; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15
4579; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1
4580; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2
4581; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4582; CHECK-NEXT:    [[TMP43:%.*]] = zext i15 [[TMP41]] to i32
4583; CHECK-NEXT:    [[TMP44:%.*]] = zext i15 [[TMP42]] to i32
4584; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
4585; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
4586; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
4587; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
4588; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
4589; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
4590; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
4591; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
4592; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
4593; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
4594; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
4595; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
4596; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
4597; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 32767
4598; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15
4599; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2
4600; CHECK-NEXT:    store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
4601; CHECK-NEXT:    ret void
4602;
4603; GFX6-LABEL: udiv_v3i15:
4604; GFX6:       ; %bb.0:
4605; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4606; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4607; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
4608; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4609; GFX6-NEXT:    s_mov_b32 s6, -1
4610; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4611; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4612; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4613; GFX6-NEXT:    s_and_b32 s8, s0, 0x7fff
4614; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
4615; GFX6-NEXT:    s_and_b32 s3, s2, 0x7fff
4616; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4617; GFX6-NEXT:    s_bfe_u32 s0, s0, 0xf000f
4618; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s3
4619; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
4620; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s0
4621; GFX6-NEXT:    s_bfe_u32 s2, s2, 0xf000f
4622; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
4623; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4624; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s2
4625; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
4626; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
4627; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4628; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
4629; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
4630; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, v2
4631; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
4632; GFX6-NEXT:    v_mul_f32_e32 v1, v6, v7
4633; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
4634; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4635; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
4636; GFX6-NEXT:    v_mad_f32 v4, -v1, v5, v6
4637; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
4638; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
4639; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v2
4640; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
4641; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
4642; GFX6-NEXT:    v_mul_f32_e32 v1, v0, v6
4643; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4644; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v1
4645; GFX6-NEXT:    v_mad_f32 v0, -v1, v2, v0
4646; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v2
4647; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v3
4648; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
4649; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
4650; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
4651; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
4652; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
4653; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
4654; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4655; GFX6-NEXT:    s_waitcnt expcnt(0)
4656; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4657; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
4658; GFX6-NEXT:    s_endpgm
4659;
4660; GFX9-LABEL: udiv_v3i15:
4661; GFX9:       ; %bb.0:
4662; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
4663; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
4664; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4665; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
4666; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4667; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4668; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4669; GFX9-NEXT:    s_and_b32 s6, s2, 0x7fff
4670; GFX9-NEXT:    s_and_b32 s3, s0, 0x7fff
4671; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
4672; GFX9-NEXT:    v_mov_b32_e32 v3, s0
4673; GFX9-NEXT:    s_bfe_u32 s0, s0, 0xf000f
4674; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s6
4675; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
4676; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
4677; GFX9-NEXT:    s_bfe_u32 s2, s2, 0xf000f
4678; GFX9-NEXT:    v_alignbit_b32 v3, s1, v3, 30
4679; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4680; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s2
4681; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
4682; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
4683; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4684; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
4685; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
4686; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
4687; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
4688; GFX9-NEXT:    v_mul_f32_e32 v1, v7, v8
4689; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
4690; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
4691; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
4692; GFX9-NEXT:    v_mad_f32 v5, -v1, v6, v7
4693; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
4694; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
4695; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
4696; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v6
4697; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
4698; GFX9-NEXT:    v_mul_f32_e32 v1, v0, v7
4699; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
4700; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v1
4701; GFX9-NEXT:    v_mad_f32 v0, -v1, v3, v0
4702; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v3
4703; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
4704; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
4705; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
4706; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
4707; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
4708; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
4709; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
4710; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
4711; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4712; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
4713; GFX9-NEXT:    s_endpgm
4714  %r = udiv <3 x i15> %x, %y
4715  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
4716  ret void
4717}
4718
4719define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
4720; CHECK-LABEL: @urem_v3i15(
4721; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4722; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4723; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
4724; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
4725; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
4726; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
4727; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
4728; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
4729; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
4730; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
4731; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
4732; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
4733; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
4734; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
4735; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
4736; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
4737; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
4738; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
4739; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
4740; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 32767
4741; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15
4742; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0
4743; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1
4744; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4745; CHECK-NEXT:    [[TMP25:%.*]] = zext i15 [[TMP23]] to i32
4746; CHECK-NEXT:    [[TMP26:%.*]] = zext i15 [[TMP24]] to i32
4747; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
4748; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
4749; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
4750; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
4751; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
4752; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
4753; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
4754; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
4755; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4756; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
4757; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
4758; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
4759; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
4760; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
4761; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
4762; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 32767
4763; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15
4764; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1
4765; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2
4766; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4767; CHECK-NEXT:    [[TMP47:%.*]] = zext i15 [[TMP45]] to i32
4768; CHECK-NEXT:    [[TMP48:%.*]] = zext i15 [[TMP46]] to i32
4769; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
4770; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
4771; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
4772; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
4773; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
4774; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
4775; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
4776; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
4777; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
4778; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
4779; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
4780; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
4781; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
4782; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
4783; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
4784; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 32767
4785; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15
4786; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2
4787; CHECK-NEXT:    store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
4788; CHECK-NEXT:    ret void
4789;
4790; GFX6-LABEL: urem_v3i15:
4791; GFX6:       ; %bb.0:
4792; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4793; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4794; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
4795; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4796; GFX6-NEXT:    s_mov_b32 s6, -1
4797; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4798; GFX6-NEXT:    s_and_b32 s8, s2, 0x7fff
4799; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
4800; GFX6-NEXT:    s_and_b32 s9, s0, 0x7fff
4801; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
4802; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4803; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
4804; GFX6-NEXT:    s_bfe_u32 s1, s0, 0xf000f
4805; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
4806; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s1
4807; GFX6-NEXT:    s_bfe_u32 s9, s2, 0xf000f
4808; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
4809; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4810; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4811; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
4812; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
4813; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
4814; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s9
4815; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4816; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
4817; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
4818; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
4819; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4820; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
4821; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s2, v1
4822; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
4823; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v2
4824; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, v0
4825; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4826; GFX6-NEXT:    v_mad_f32 v3, -v1, v5, v3
4827; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v4
4828; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
4829; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
4830; GFX6-NEXT:    s_lshr_b32 s0, s0, 15
4831; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
4832; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4833; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v3
4834; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4835; GFX6-NEXT:    v_mad_f32 v3, -v3, v4, v7
4836; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
4837; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
4838; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
4839; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
4840; GFX6-NEXT:    s_lshr_b32 s3, s2, 15
4841; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s3, v1
4842; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
4843; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
4844; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
4845; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v6
4846; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
4847; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
4848; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
4849; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4850; GFX6-NEXT:    s_waitcnt expcnt(0)
4851; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4852; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
4853; GFX6-NEXT:    s_endpgm
4854;
4855; GFX9-LABEL: urem_v3i15:
4856; GFX9:       ; %bb.0:
4857; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
4858; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
4859; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4860; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
4861; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4862; GFX9-NEXT:    s_and_b32 s6, s2, 0x7fff
4863; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s6
4864; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4865; GFX9-NEXT:    s_and_b32 s7, s0, 0x7fff
4866; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
4867; GFX9-NEXT:    s_bfe_u32 s6, s0, 0xf000f
4868; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s6
4869; GFX9-NEXT:    v_mov_b32_e32 v3, s0
4870; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
4871; GFX9-NEXT:    v_alignbit_b32 v3, s1, v3, 30
4872; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4873; GFX9-NEXT:    s_bfe_u32 s3, s2, 0xf000f
4874; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4875; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4876; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
4877; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
4878; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
4879; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
4880; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s3
4881; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
4882; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
4883; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
4884; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
4885; GFX9-NEXT:    v_mul_f32_e32 v4, v7, v8
4886; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
4887; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v5
4888; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4889; GFX9-NEXT:    v_mad_f32 v7, -v4, v6, v7
4890; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
4891; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, v6
4892; GFX9-NEXT:    v_mul_f32_e32 v6, v8, v9
4893; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
4894; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
4895; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
4896; GFX9-NEXT:    v_mad_f32 v6, -v6, v5, v8
4897; GFX9-NEXT:    s_lshr_b32 s1, s0, 15
4898; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
4899; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s1
4900; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
4901; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s0
4902; GFX9-NEXT:    v_mul_lo_u32 v3, v5, v3
4903; GFX9-NEXT:    s_lshr_b32 s0, s2, 15
4904; GFX9-NEXT:    v_sub_u32_e32 v4, s0, v4
4905; GFX9-NEXT:    v_sub_u32_e32 v5, s2, v1
4906; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
4907; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
4908; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
4909; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v5
4910; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
4911; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
4912; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
4913; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
4914; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4915; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
4916; GFX9-NEXT:    s_endpgm
4917  %r = urem <3 x i15> %x, %y
4918  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
4919  ret void
4920}
4921
; Signed division of a <3 x i15> vector by a variable <3 x i15> divisor.
;
; What the IR-level assertions below pin down: the division is scalarized,
; one lane at a time. Each lane is extracted, sign-extended to i32 and
; divided through a float sequence (sitofp, llvm.amdgcn.rcp.f32, fmul,
; llvm.trunc.f32, llvm.amdgcn.fmad.ftz.f32). The truncated quotient is then
; adjusted by a select between ((x ^ y) >> 30) | 1 and 0, sign-extended from
; 15 bits with a shl/ashr pair by 17, truncated back to i15 and reinserted.
;
; What the ISA assertions below pin down: the three 15-bit results are packed
; (and 0x7fff, shift by 15, 64-bit shift by 30, or) and written with one dword
; store plus one short store at offset 4, the latter masked with 0x1fff.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
4922define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
4923; CHECK-LABEL: @sdiv_v3i15(
4924; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4925; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4926; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
4927; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
4928; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4929; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4930; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4931; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4932; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4933; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4934; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4935; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4936; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4937; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4938; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4939; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4940; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4941; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4942; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4943; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4944; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 17
4945; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 17
4946; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15
4947; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0
4948; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1
4949; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4950; CHECK-NEXT:    [[TMP27:%.*]] = sext i15 [[TMP25]] to i32
4951; CHECK-NEXT:    [[TMP28:%.*]] = sext i15 [[TMP26]] to i32
4952; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
4953; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
4954; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
4955; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
4956; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
4957; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
4958; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
4959; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
4960; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
4961; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
4962; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
4963; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
4964; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4965; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
4966; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
4967; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
4968; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 17
4969; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 17
4970; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15
4971; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1
4972; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2
4973; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4974; CHECK-NEXT:    [[TMP51:%.*]] = sext i15 [[TMP49]] to i32
4975; CHECK-NEXT:    [[TMP52:%.*]] = sext i15 [[TMP50]] to i32
4976; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
4977; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
4978; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
4979; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
4980; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
4981; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
4982; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
4983; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
4984; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
4985; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
4986; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
4987; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
4988; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
4989; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
4990; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
4991; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
4992; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 17
4993; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 17
4994; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15
4995; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2
4996; CHECK-NEXT:    store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
4997; CHECK-NEXT:    ret void
4998;
4999; GFX6-LABEL: sdiv_v3i15:
5000; GFX6:       ; %bb.0:
5001; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
5002; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5003; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
5004; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5005; GFX6-NEXT:    s_mov_b32 s6, -1
5006; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5007; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5008; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
5009; GFX6-NEXT:    s_bfe_i32 s3, s0, 0xf0000
5010; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s3
5011; GFX6-NEXT:    v_mov_b32_e32 v1, s0
5012; GFX6-NEXT:    v_alignbit_b32 v1, s1, v1, 30
5013; GFX6-NEXT:    s_bfe_i32 s1, s2, 0xf0000
5014; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s1
5015; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
5016; GFX6-NEXT:    s_xor_b32 s1, s1, s3
5017; GFX6-NEXT:    s_bfe_i32 s0, s0, 0xf000f
5018; GFX6-NEXT:    s_ashr_i32 s1, s1, 30
5019; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
5020; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
5021; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
5022; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
5023; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
5024; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s0
5025; GFX6-NEXT:    s_or_b32 s1, s1, 1
5026; GFX6-NEXT:    v_mov_b32_e32 v5, s1
5027; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
5028; GFX6-NEXT:    s_bfe_i32 s1, s2, 0xf000f
5029; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
5030; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s1
5031; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
5032; GFX6-NEXT:    s_xor_b32 s0, s1, s0
5033; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 15
5034; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
5035; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
5036; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
5037; GFX6-NEXT:    v_mad_f32 v4, -v5, v3, v4
5038; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
5039; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
5040; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v1
5041; GFX6-NEXT:    s_or_b32 s0, s0, 1
5042; GFX6-NEXT:    v_mov_b32_e32 v6, s0
5043; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
5044; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
5045; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
5046; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, v0
5047; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
5048; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v1
5049; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
5050; GFX6-NEXT:    v_or_b32_e32 v0, 1, v0
5051; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
5052; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
5053; GFX6-NEXT:    v_mad_f32 v5, -v1, v4, v5
5054; GFX6-NEXT:    v_cvt_i32_f32_e32 v1, v1
5055; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
5056; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
5057; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
5058; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5059; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
5060; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
5061; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
5062; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
5063; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
5064; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5065; GFX6-NEXT:    s_waitcnt expcnt(0)
5066; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5067; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
5068; GFX6-NEXT:    s_endpgm
5069;
5070; GFX9-LABEL: sdiv_v3i15:
5071; GFX9:       ; %bb.0:
5072; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5073; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
5074; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
5075; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5076; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5077; GFX9-NEXT:    s_bfe_i32 s1, s2, 0xf0000
5078; GFX9-NEXT:    s_bfe_i32 s0, s4, 0xf0000
5079; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
5080; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s1
5081; GFX9-NEXT:    s_xor_b32 s0, s1, s0
5082; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5083; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
5084; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5085; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
5086; GFX9-NEXT:    s_or_b32 s3, s0, 1
5087; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
5088; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
5089; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
5090; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
5091; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5092; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
5093; GFX9-NEXT:    s_cselect_b32 s0, s3, 0
5094; GFX9-NEXT:    s_bfe_i32 s1, s4, 0xf000f
5095; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s1
5096; GFX9-NEXT:    v_add_u32_e32 v4, s0, v5
5097; GFX9-NEXT:    s_bfe_i32 s0, s2, 0xf000f
5098; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
5099; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
5100; GFX9-NEXT:    v_mov_b32_e32 v1, s4
5101; GFX9-NEXT:    v_alignbit_b32 v1, s5, v1, 30
5102; GFX9-NEXT:    s_xor_b32 s0, s0, s1
5103; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
5104; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
5105; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5106; GFX9-NEXT:    v_mad_f32 v5, -v6, v3, v5
5107; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 15
5108; GFX9-NEXT:    s_or_b32 s2, s0, 1
5109; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
5110; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v3|
5111; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v1
5112; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5113; GFX9-NEXT:    s_cselect_b32 s0, s2, 0
5114; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 15
5115; GFX9-NEXT:    v_add_u32_e32 v5, s0, v6
5116; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, v0
5117; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
5118; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
5119; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
5120; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
5121; GFX9-NEXT:    v_mul_f32_e32 v1, v6, v7
5122; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
5123; GFX9-NEXT:    v_cvt_i32_f32_e32 v7, v1
5124; GFX9-NEXT:    v_mad_f32 v1, -v1, v3, v6
5125; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v3|
5126; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
5127; GFX9-NEXT:    v_add_u32_e32 v0, v7, v0
5128; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
5129; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
5130; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
5131; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
5132; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
5133; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
5134; GFX9-NEXT:    global_store_dword v2, v0, s[6:7]
5135; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5136; GFX9-NEXT:    global_store_short v2, v0, s[6:7] offset:4
5137; GFX9-NEXT:    s_endpgm
5138  %r = sdiv <3 x i15> %x, %y
5139  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
5140  ret void
5141}
5142
; Signed remainder of a <3 x i15> vector by a variable <3 x i15> divisor.
;
; What the IR-level assertions below pin down: the operation is scalarized,
; one lane at a time. Each lane is extracted, sign-extended to i32, and a
; quotient is formed through a float sequence (sitofp, llvm.amdgcn.rcp.f32,
; fmul, llvm.trunc.f32, llvm.amdgcn.fmad.ftz.f32) and adjusted by a select
; between ((x ^ y) >> 30) | 1 and 0. The remainder is then computed as
; x - quotient * y, sign-extended from 15 bits with a shl/ashr pair by 17,
; truncated back to i15 and reinserted.
;
; What the ISA assertions below pin down: the three 15-bit results are packed
; (and 0x7fff, shift by 15, 64-bit shift by 30, or) and written with one dword
; store plus one short store at offset 4, the latter masked with 0x1fff.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
5143define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
5144; CHECK-LABEL: @srem_v3i15(
5145; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
5146; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
5147; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
5148; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
5149; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
5150; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
5151; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
5152; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
5153; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
5154; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
5155; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
5156; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
5157; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
5158; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
5159; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
5160; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
5161; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
5162; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
5163; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
5164; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
5165; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
5166; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
5167; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 17
5168; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 17
5169; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15
5170; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0
5171; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1
5172; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1
5173; CHECK-NEXT:    [[TMP29:%.*]] = sext i15 [[TMP27]] to i32
5174; CHECK-NEXT:    [[TMP30:%.*]] = sext i15 [[TMP28]] to i32
5175; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
5176; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
5177; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
5178; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
5179; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
5180; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
5181; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
5182; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
5183; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
5184; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
5185; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
5186; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
5187; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
5188; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
5189; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
5190; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
5191; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
5192; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
5193; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 17
5194; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 17
5195; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15
5196; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1
5197; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2
5198; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2
5199; CHECK-NEXT:    [[TMP55:%.*]] = sext i15 [[TMP53]] to i32
5200; CHECK-NEXT:    [[TMP56:%.*]] = sext i15 [[TMP54]] to i32
5201; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
5202; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
5203; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
5204; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
5205; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
5206; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
5207; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
5208; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
5209; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
5210; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
5211; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
5212; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
5213; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
5214; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
5215; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
5216; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
5217; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
5218; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
5219; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 17
5220; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 17
5221; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15
5222; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2
5223; CHECK-NEXT:    store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
5224; CHECK-NEXT:    ret void
5225;
5226; GFX6-LABEL: srem_v3i15:
5227; GFX6:       ; %bb.0:
5228; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
5229; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5230; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
5231; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5232; GFX6-NEXT:    s_mov_b32 s6, -1
5233; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5234; GFX6-NEXT:    s_bfe_i32 s9, s2, 0xf0000
5235; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s9
5236; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5237; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
5238; GFX6-NEXT:    s_bfe_i32 s1, s0, 0xf0000
5239; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s1
5240; GFX6-NEXT:    s_xor_b32 s1, s9, s1
5241; GFX6-NEXT:    s_ashr_i32 s1, s1, 30
5242; GFX6-NEXT:    s_or_b32 s1, s1, 1
5243; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
5244; GFX6-NEXT:    v_mov_b32_e32 v7, s1
5245; GFX6-NEXT:    s_lshr_b32 s8, s0, 15
5246; GFX6-NEXT:    s_bfe_i32 s1, s2, 0xf000f
5247; GFX6-NEXT:    v_mul_f32_e32 v6, v5, v6
5248; GFX6-NEXT:    v_trunc_f32_e32 v6, v6
5249; GFX6-NEXT:    v_mad_f32 v5, -v6, v4, v5
5250; GFX6-NEXT:    v_cvt_i32_f32_e32 v6, v6
5251; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
5252; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc
5253; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v2
5254; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
5255; GFX6-NEXT:    v_mul_lo_u32 v4, v4, s0
5256; GFX6-NEXT:    s_bfe_i32 s0, s0, 0xf000f
5257; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s0
5258; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, s1
5259; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s2, v4
5260; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
5261; GFX6-NEXT:    s_xor_b32 s0, s1, s0
5262; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 15
5263; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
5264; GFX6-NEXT:    v_mul_f32_e32 v7, v6, v7
5265; GFX6-NEXT:    v_trunc_f32_e32 v7, v7
5266; GFX6-NEXT:    v_mad_f32 v6, -v7, v5, v6
5267; GFX6-NEXT:    v_cvt_i32_f32_e32 v7, v7
5268; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v5|
5269; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v2
5270; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5271; GFX6-NEXT:    s_or_b32 s0, s0, 1
5272; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
5273; GFX6-NEXT:    v_mov_b32_e32 v8, s0
5274; GFX6-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
5275; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v8, vcc
5276; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
5277; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
5278; GFX6-NEXT:    v_cvt_f32_i32_e32 v7, v0
5279; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
5280; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v2
5281; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
5282; GFX6-NEXT:    v_or_b32_e32 v0, 1, v0
5283; GFX6-NEXT:    v_mul_f32_e32 v2, v7, v8
5284; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
5285; GFX6-NEXT:    v_mad_f32 v7, -v2, v6, v7
5286; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
5287; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
5288; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
5289; GFX6-NEXT:    v_mul_lo_u32 v5, v5, s8
5290; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5291; GFX6-NEXT:    v_mul_lo_u32 v0, v0, v3
5292; GFX6-NEXT:    s_lshr_b32 s3, s2, 15
5293; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v5
5294; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v0, v1
5295; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
5296; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
5297; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
5298; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 15, v2
5299; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
5300; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
5301; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5302; GFX6-NEXT:    s_waitcnt expcnt(0)
5303; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5304; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
5305; GFX6-NEXT:    s_endpgm
5306;
5307; GFX9-LABEL: srem_v3i15:
5308; GFX9:       ; %bb.0:
5309; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5310; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
5311; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
5312; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5313; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5314; GFX9-NEXT:    s_bfe_i32 s1, s2, 0xf0000
5315; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s1
5316; GFX9-NEXT:    s_bfe_i32 s0, s6, 0xf0000
5317; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s0
5318; GFX9-NEXT:    s_xor_b32 s0, s1, s0
5319; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5320; GFX9-NEXT:    v_mov_b32_e32 v1, s6
5321; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
5322; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5323; GFX9-NEXT:    s_lshr_b32 s8, s2, 15
5324; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
5325; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
5326; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
5327; GFX9-NEXT:    v_mad_f32 v5, -v6, v4, v5
5328; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
5329; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
5330; GFX9-NEXT:    s_lshr_b32 s3, s6, 15
5331; GFX9-NEXT:    s_or_b32 s7, s0, 1
5332; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
5333; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5334; GFX9-NEXT:    s_cselect_b32 s0, s7, 0
5335; GFX9-NEXT:    v_add_u32_e32 v4, s0, v6
5336; GFX9-NEXT:    s_bfe_i32 s0, s6, 0xf000f
5337; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
5338; GFX9-NEXT:    s_bfe_i32 s1, s2, 0xf000f
5339; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, s1
5340; GFX9-NEXT:    s_xor_b32 s0, s1, s0
5341; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
5342; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v1
5343; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5344; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 15
5345; GFX9-NEXT:    v_mul_f32_e32 v7, v6, v7
5346; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
5347; GFX9-NEXT:    v_mad_f32 v6, -v7, v5, v6
5348; GFX9-NEXT:    v_cvt_i32_f32_e32 v7, v7
5349; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s6
5350; GFX9-NEXT:    s_or_b32 s6, s0, 1
5351; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v6|, |v5|
5352; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, v1
5353; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5354; GFX9-NEXT:    s_cselect_b32 s0, s6, 0
5355; GFX9-NEXT:    v_add_u32_e32 v5, s0, v7
5356; GFX9-NEXT:    v_bfe_i32 v7, v0, 0, 15
5357; GFX9-NEXT:    v_cvt_f32_i32_e32 v8, v7
5358; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v6
5359; GFX9-NEXT:    v_xor_b32_e32 v1, v7, v1
5360; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
5361; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1
5362; GFX9-NEXT:    v_mul_f32_e32 v7, v8, v9
5363; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
5364; GFX9-NEXT:    v_cvt_i32_f32_e32 v9, v7
5365; GFX9-NEXT:    v_mad_f32 v7, -v7, v6, v8
5366; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
5367; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5368; GFX9-NEXT:    v_mul_lo_u32 v5, v5, s3
5369; GFX9-NEXT:    v_add_u32_e32 v1, v9, v1
5370; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
5371; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
5372; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v4
5373; GFX9-NEXT:    v_sub_u32_e32 v4, s8, v5
5374; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
5375; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
5376; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
5377; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
5378; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
5379; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
5380; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
5381; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
5382; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5383; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
5384; GFX9-NEXT:    s_endpgm
5385  %r = srem <3 x i15> %x, %y
5386  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
5387  ret void
5388}
5389
; Unsigned i32 division by the odd constant 1235195.
;
; What the IR-level assertions below pin down: the udiv by the constant is
; left untouched in the IR (no float-based expansion is emitted for it).
;
; What the ISA assertions below pin down: no divide sequence is emitted;
; the quotient is formed by a multiply-high with 0xb2a50881, then
; sub / shift-right by 1 / add / shift-right by 20. One llc configuration
; does this with VALU instructions, the other with scalar instructions.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
5390define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
5391; CHECK-LABEL: @udiv_i32_oddk_denom(
5392; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 1235195
5393; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5394; CHECK-NEXT:    ret void
5395;
5396; GFX6-LABEL: udiv_i32_oddk_denom:
5397; GFX6:       ; %bb.0:
5398; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
5399; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5400; GFX6-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
5401; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5402; GFX6-NEXT:    s_mov_b32 s2, -1
5403; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5404; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
5405; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v0
5406; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
5407; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5408; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
5409; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5410; GFX6-NEXT:    s_endpgm
5411;
5412; GFX9-LABEL: udiv_i32_oddk_denom:
5413; GFX9:       ; %bb.0:
5414; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5415; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5416; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5417; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5418; GFX9-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
5419; GFX9-NEXT:    s_sub_i32 s1, s4, s0
5420; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
5421; GFX9-NEXT:    s_add_i32 s1, s1, s0
5422; GFX9-NEXT:    s_lshr_b32 s0, s1, 20
5423; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5424; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5425; GFX9-NEXT:    s_endpgm
5426  %r = udiv i32 %x, 1235195
5427  store i32 %r, i32 addrspace(1)* %out
5428  ret void
5429}
5430
; Unsigned i32 division of the kernel argument %x by the power-of-two constant
; 4096; the quotient is stored to %out.
; Expected opt output: the udiv is left unchanged.
; Expected llc output (tahiti and gfx900): a single s_lshr_b32 by 12.
5431define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
5432; CHECK-LABEL: @udiv_i32_pow2k_denom(
5433; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 4096
5434; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5435; CHECK-NEXT:    ret void
5436;
5437; GFX6-LABEL: udiv_i32_pow2k_denom:
5438; GFX6:       ; %bb.0:
5439; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
5440; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5441; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5442; GFX6-NEXT:    s_mov_b32 s2, -1
5443; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5444; GFX6-NEXT:    s_lshr_b32 s4, s4, 12
5445; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5446; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5447; GFX6-NEXT:    s_endpgm
5448;
5449; GFX9-LABEL: udiv_i32_pow2k_denom:
5450; GFX9:       ; %bb.0:
5451; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5452; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5453; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5454; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5455; GFX9-NEXT:    s_lshr_b32 s0, s4, 12
5456; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5457; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5458; GFX9-NEXT:    s_endpgm
5459  %r = udiv i32 %x, 4096
5460  store i32 %r, i32 addrspace(1)* %out
5461  ret void
5462}
5463
; Unsigned i32 division of %x by a non-constant divisor computed as
; `shl i32 4096, %y`; the quotient is stored to %out.
; Expected opt output: both the shl and the udiv are left unchanged.
; Expected llc output (tahiti and gfx900): an s_add_i32 of 12 to the loaded
; shift amount followed by a single s_lshr_b32; no divide sequence.
5464define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
5465; CHECK-LABEL: @udiv_i32_pow2_shl_denom(
5466; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
5467; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]]
5468; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5469; CHECK-NEXT:    ret void
5470;
5471; GFX6-LABEL: udiv_i32_pow2_shl_denom:
5472; GFX6:       ; %bb.0:
5473; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
5474; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5475; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5476; GFX6-NEXT:    s_mov_b32 s2, -1
5477; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5478; GFX6-NEXT:    s_add_i32 s5, s5, 12
5479; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
5480; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5481; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5482; GFX6-NEXT:    s_endpgm
5483;
5484; GFX9-LABEL: udiv_i32_pow2_shl_denom:
5485; GFX9:       ; %bb.0:
5486; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5487; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
5488; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5489; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5490; GFX9-NEXT:    s_add_i32 s0, s3, 12
5491; GFX9-NEXT:    s_lshr_b32 s0, s2, s0
5492; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5493; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
5494; GFX9-NEXT:    s_endpgm
5495  %shl.y = shl i32 4096, %y
5496  %r = udiv i32 %x, %shl.y
5497  store i32 %r, i32 addrspace(1)* %out
5498  ret void
5499}
5500
; Unsigned <2 x i32> division of %x by the splat constant <4096, 4096>; the
; result vector is stored to %out.
; Expected opt output: the vector udiv is scalarized into one
; extractelement / udiv-by-4096 / insertelement triple per lane.
; Expected llc output (tahiti and gfx900): one s_lshr_b32 by 12 per lane.
5501define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
5502; CHECK-LABEL: @udiv_v2i32_pow2k_denom(
5503; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5504; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
5505; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
5506; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5507; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096
5508; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5509; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5510; CHECK-NEXT:    ret void
5511;
5512; GFX6-LABEL: udiv_v2i32_pow2k_denom:
5513; GFX6:       ; %bb.0:
5514; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
5515; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5516; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5517; GFX6-NEXT:    s_mov_b32 s2, -1
5518; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5519; GFX6-NEXT:    s_lshr_b32 s4, s4, 12
5520; GFX6-NEXT:    s_lshr_b32 s5, s5, 12
5521; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5522; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5523; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5524; GFX6-NEXT:    s_endpgm
5525;
5526; GFX9-LABEL: udiv_v2i32_pow2k_denom:
5527; GFX9:       ; %bb.0:
5528; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5529; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
5530; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5531; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5532; GFX9-NEXT:    s_lshr_b32 s0, s2, 12
5533; GFX9-NEXT:    s_lshr_b32 s1, s3, 12
5534; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5535; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5536; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
5537; GFX9-NEXT:    s_endpgm
5538  %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
5539  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5540  ret void
5541}
5542
; Unsigned <2 x i32> division of %x by the non-splat constant <4096, 4095>
; (lane 0 is a power of two, lane 1 is not); the result is stored to %out.
; Expected opt output: the vector udiv is scalarized into per-lane udivs by
; 4096 and 4095 respectively.
; Expected llc output (tahiti and gfx900): lane 0 is a shift-right by 12;
; lane 1 is a mul_hi by 0x100101 followed by sub, shift-right 1, add and
; shift-right 11.
5543define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
5544; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom(
5545; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5546; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
5547; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
5548; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5549; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095
5550; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5551; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5552; CHECK-NEXT:    ret void
5553;
5554; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom:
5555; GFX6:       ; %bb.0:
5556; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
5557; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5558; GFX6-NEXT:    v_mov_b32_e32 v0, 0x100101
5559; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5560; GFX6-NEXT:    s_mov_b32 s2, -1
5561; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5562; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
5563; GFX6-NEXT:    s_lshr_b32 s4, s4, 12
5564; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v0
5565; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
5566; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5567; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 11, v0
5568; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5569; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5570; GFX6-NEXT:    s_endpgm
5571;
5572; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
5573; GFX9:       ; %bb.0:
5574; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5575; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
5576; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5577; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5578; GFX9-NEXT:    s_mul_hi_u32 s1, s3, 0x100101
5579; GFX9-NEXT:    s_lshr_b32 s0, s2, 12
5580; GFX9-NEXT:    s_sub_i32 s2, s3, s1
5581; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
5582; GFX9-NEXT:    s_add_i32 s2, s2, s1
5583; GFX9-NEXT:    s_lshr_b32 s1, s2, 11
5584; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5585; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5586; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
5587; GFX9-NEXT:    s_endpgm
5588  %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
5589  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5590  ret void
5591}
5592
; Unsigned <2 x i32> division of %x by a non-constant vector divisor computed
; as `shl <2 x i32> <4096, 4096>, %y`; the result is stored to %out.
; Expected opt output: the vector udiv is scalarized and each lane is expanded
; into the llvm.amdgcn.rcp.f32-based sequence (uitofp, rcp, fmul, fptoui, a
; 64-bit multiply refinement, then two compare/select quotient corrections).
; Expected llc output (tahiti and gfx900): the same expansion per lane using
; v_rcp_iflag_f32, v_mul_hi_u32 and v_cndmask_b32. Unlike the scalar
; udiv_i32_pow2_shl_denom test, the expected output here is not a plain shift.
5593define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
5594; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom(
5595; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
5596; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5597; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
5598; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
5599; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
5600; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
5601; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
5602; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
5603; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
5604; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
5605; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
5606; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
5607; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
5608; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
5609; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
5610; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
5611; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
5612; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
5613; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
5614; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
5615; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
5616; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
5617; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
5618; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
5619; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
5620; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
5621; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
5622; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
5623; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
5624; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
5625; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
5626; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
5627; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0
5628; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1
5629; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
5630; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
5631; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
5632; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
5633; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
5634; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
5635; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
5636; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
5637; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
5638; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
5639; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
5640; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
5641; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
5642; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
5643; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
5644; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
5645; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
5646; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
5647; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
5648; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
5649; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
5650; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
5651; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
5652; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
5653; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
5654; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
5655; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
5656; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
5657; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
5658; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
5659; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1
5660; CHECK-NEXT:    store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5661; CHECK-NEXT:    ret void
5662;
5663; GFX6-LABEL: udiv_v2i32_pow2_shl_denom:
5664; GFX6:       ; %bb.0:
5665; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
5666; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
5667; GFX6-NEXT:    s_mov_b32 s11, 0xf000
5668; GFX6-NEXT:    s_mov_b32 s10, -1
5669; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5670; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s6
5671; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
5672; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s7
5673; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
5674; GFX6-NEXT:    s_sub_i32 s0, 0, s2
5675; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
5676; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
5677; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
5678; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
5679; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
5680; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
5681; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v0
5682; GFX6-NEXT:    s_sub_i32 s0, 0, s3
5683; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
5684; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
5685; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
5686; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
5687; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
5688; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
5689; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
5690; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
5691; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
5692; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
5693; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
5694; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
5695; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
5696; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v2
5697; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
5698; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
5699; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
5700; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
5701; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v4
5702; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
5703; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v2
5704; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
5705; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v2
5706; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
5707; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
5708; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
5709; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
5710; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
5711; GFX6-NEXT:    s_endpgm
5712;
5713; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
5714; GFX9:       ; %bb.0:
5715; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
5716; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5717; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s6
5718; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s7
5719; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
5720; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
5721; GFX9-NEXT:    s_sub_i32 s2, 0, s6
5722; GFX9-NEXT:    s_sub_i32 s3, 0, s7
5723; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
5724; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
5725; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
5726; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
5727; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
5728; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
5729; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
5730; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
5731; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5732; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
5733; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
5734; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
5735; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
5736; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
5737; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
5738; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5739; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s6
5740; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s7
5741; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
5742; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
5743; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
5744; GFX9-NEXT:    v_sub_u32_e32 v4, s5, v4
5745; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
5746; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
5747; GFX9-NEXT:    v_subrev_u32_e32 v5, s6, v3
5748; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v4
5749; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[0:1]
5750; GFX9-NEXT:    v_subrev_u32_e32 v6, s7, v4
5751; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
5752; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
5753; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
5754; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
5755; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
5756; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
5757; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v4
5758; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
5759; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5760; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
5761; GFX9-NEXT:    s_endpgm
5762  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
5763  %r = udiv <2 x i32> %x, %shl.y
5764  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5765  ret void
5766}
5767
; Unsigned i32 remainder of the kernel argument %x by the odd constant
; 1235195; the remainder is stored to %out.
; Expected opt output: the urem is left unchanged.
; Expected llc output (tahiti and gfx900): the quotient is formed from a
; mul_hi by 0xb2a50881 plus sub/shift/add/shift, then multiplied by 0x12d8fb
; (1235195) and subtracted from the original value.
5768define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
5769; CHECK-LABEL: @urem_i32_oddk_denom(
5770; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 1235195
5771; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5772; CHECK-NEXT:    ret void
5773;
5774; GFX6-LABEL: urem_i32_oddk_denom:
5775; GFX6:       ; %bb.0:
5776; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
5777; GFX6-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
5778; GFX6-NEXT:    s_mov_b32 s2, 0x12d8fb
5779; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5780; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5781; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5782; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
5783; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v0
5784; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
5785; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
5786; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
5787; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
5788; GFX6-NEXT:    s_mov_b32 s2, -1
5789; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
5790; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5791; GFX6-NEXT:    s_endpgm
5792;
5793; GFX9-LABEL: urem_i32_oddk_denom:
5794; GFX9:       ; %bb.0:
5795; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5796; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5797; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5798; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5799; GFX9-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
5800; GFX9-NEXT:    s_sub_i32 s1, s4, s0
5801; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
5802; GFX9-NEXT:    s_add_i32 s1, s1, s0
5803; GFX9-NEXT:    s_lshr_b32 s0, s1, 20
5804; GFX9-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
5805; GFX9-NEXT:    s_sub_i32 s0, s4, s0
5806; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5807; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5808; GFX9-NEXT:    s_endpgm
5809  %r = urem i32 %x, 1235195
5810  store i32 %r, i32 addrspace(1)* %out
5811  ret void
5812}
5813
; Unsigned i32 remainder of the kernel argument %x by the power-of-two
; constant 4096; the remainder is stored to %out.
; Expected opt output: the urem is left unchanged.
; Expected llc output (tahiti and gfx900): a single s_and_b32 with 0xfff.
5814define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
5815; CHECK-LABEL: @urem_i32_pow2k_denom(
5816; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 4096
5817; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5818; CHECK-NEXT:    ret void
5819;
5820; GFX6-LABEL: urem_i32_pow2k_denom:
5821; GFX6:       ; %bb.0:
5822; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
5823; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5824; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5825; GFX6-NEXT:    s_mov_b32 s2, -1
5826; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5827; GFX6-NEXT:    s_and_b32 s4, s4, 0xfff
5828; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5829; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5830; GFX6-NEXT:    s_endpgm
5831;
5832; GFX9-LABEL: urem_i32_pow2k_denom:
5833; GFX9:       ; %bb.0:
5834; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5835; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5836; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5837; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5838; GFX9-NEXT:    s_and_b32 s0, s4, 0xfff
5839; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5840; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5841; GFX9-NEXT:    s_endpgm
5842  %r = urem i32 %x, 4096
5843  store i32 %r, i32 addrspace(1)* %out
5844  ret void
5845}
5846
; Unsigned i32 remainder of %x by a non-constant divisor computed as
; `shl i32 4096, %y`; the remainder is stored to %out.
; Expected opt output: both the shl and the urem are left unchanged.
; Expected llc output (tahiti and gfx900): the divisor is materialized with
; s_lshl_b32 0x1000, decremented by one (s_add_i32 ..., -1) and used as an
; s_and_b32 mask; no divide sequence.
5847define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
5848; CHECK-LABEL: @urem_i32_pow2_shl_denom(
5849; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
5850; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]]
5851; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5852; CHECK-NEXT:    ret void
5853;
5854; GFX6-LABEL: urem_i32_pow2_shl_denom:
5855; GFX6:       ; %bb.0:
5856; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
5857; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5858; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5859; GFX6-NEXT:    s_mov_b32 s2, -1
5860; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5861; GFX6-NEXT:    s_lshl_b32 s5, 0x1000, s5
5862; GFX6-NEXT:    s_add_i32 s5, s5, -1
5863; GFX6-NEXT:    s_and_b32 s4, s4, s5
5864; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5865; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5866; GFX6-NEXT:    s_endpgm
5867;
5868; GFX9-LABEL: urem_i32_pow2_shl_denom:
5869; GFX9:       ; %bb.0:
5870; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5871; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
5872; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5873; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5874; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s3
5875; GFX9-NEXT:    s_add_i32 s0, s0, -1
5876; GFX9-NEXT:    s_and_b32 s0, s2, s0
5877; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5878; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
5879; GFX9-NEXT:    s_endpgm
5880  %shl.y = shl i32 4096, %y
5881  %r = urem i32 %x, %shl.y
5882  store i32 %r, i32 addrspace(1)* %out
5883  ret void
5884}
5885
; Unsigned <2 x i32> remainder of %x by the splat constant <4096, 4096>; the
; result vector is stored to %out.
; Expected opt output: the vector urem is scalarized into one
; extractelement / urem-by-4096 / insertelement triple per lane.
; Expected llc output (tahiti and gfx900): one s_and_b32 with 0xfff per lane.
5886define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
5887; CHECK-LABEL: @urem_v2i32_pow2k_denom(
5888; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5889; CHECK-NEXT:    [[TMP2:%.*]] = urem i32 [[TMP1]], 4096
5890; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
5891; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5892; CHECK-NEXT:    [[TMP5:%.*]] = urem i32 [[TMP4]], 4096
5893; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5894; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5895; CHECK-NEXT:    ret void
5896;
5897; GFX6-LABEL: urem_v2i32_pow2k_denom:
5898; GFX6:       ; %bb.0:
5899; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
5900; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5901; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5902; GFX6-NEXT:    s_mov_b32 s2, -1
5903; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5904; GFX6-NEXT:    s_and_b32 s4, s4, 0xfff
5905; GFX6-NEXT:    s_and_b32 s5, s5, 0xfff
5906; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5907; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5908; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5909; GFX6-NEXT:    s_endpgm
5910;
5911; GFX9-LABEL: urem_v2i32_pow2k_denom:
5912; GFX9:       ; %bb.0:
5913; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5914; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
5915; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5916; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5917; GFX9-NEXT:    s_and_b32 s0, s2, 0xfff
5918; GFX9-NEXT:    s_and_b32 s1, s3, 0xfff
5919; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5920; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5921; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
5922; GFX9-NEXT:    s_endpgm
5923  %r = urem <2 x i32> %x, <i32 4096, i32 4096>
5924  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5925  ret void
5926}
5927
; Unsigned <2 x i32> remainder of %x by a non-constant vector divisor computed
; as `shl <2 x i32> <4096, 4096>, %y`; the result is stored to %out.
; Expected opt output: the vector urem is scalarized and each lane is expanded
; into the llvm.amdgcn.rcp.f32-based sequence (uitofp, rcp, fmul, fptoui, a
; 64-bit multiply refinement, then two compare/subtract/select remainder
; corrections).
; Expected llc output: tahiti keeps the whole expansion in vector
; instructions (v_mul_hi_u32, v_subrev_i32, v_cndmask_b32); gfx900 moves the
; reciprocal estimates to scalar registers with v_readfirstlane_b32 and
; finishes each lane with s_mul_hi_u32 / s_cmp_ge_u32 / s_cselect_b32.
; Unlike the scalar urem_i32_pow2_shl_denom test, the expected output here is
; not a plain mask.
5928define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
5929; CHECK-LABEL: @urem_v2i32_pow2_shl_denom(
5930; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
5931; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5932; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
5933; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
5934; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
5935; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
5936; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
5937; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
5938; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
5939; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
5940; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
5941; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
5942; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
5943; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
5944; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
5945; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
5946; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
5947; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
5948; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
5949; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
5950; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
5951; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
5952; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
5953; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
5954; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
5955; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
5956; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
5957; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
5958; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
5959; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
5960; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0
5961; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1
5962; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
5963; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
5964; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
5965; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
5966; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
5967; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
5968; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
5969; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
5970; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
5971; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
5972; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
5973; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
5974; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
5975; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
5976; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
5977; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
5978; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
5979; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
5980; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
5981; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
5982; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
5983; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
5984; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
5985; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
5986; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
5987; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
5988; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
5989; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
5990; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1
5991; CHECK-NEXT:    store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5992; CHECK-NEXT:    ret void
5993;
5994; GFX6-LABEL: urem_v2i32_pow2_shl_denom:
5995; GFX6:       ; %bb.0:
5996; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
5997; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5998; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5999; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6000; GFX6-NEXT:    s_lshl_b32 s6, 0x1000, s6
6001; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
6002; GFX6-NEXT:    s_lshl_b32 s7, 0x1000, s7
6003; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
6004; GFX6-NEXT:    s_sub_i32 s2, 0, s6
6005; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6006; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6007; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6008; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6009; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
6010; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
6011; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
6012; GFX6-NEXT:    s_sub_i32 s2, 0, s7
6013; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
6014; GFX6-NEXT:    s_mov_b32 s2, -1
6015; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
6016; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
6017; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
6018; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
6019; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6020; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
6021; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
6022; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
6023; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
6024; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
6025; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
6026; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
6027; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6028; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
6029; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
6030; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6031; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
6032; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
6033; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6034; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
6035; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
6036; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6037; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6038; GFX6-NEXT:    s_endpgm
6039;
6040; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
6041; GFX9:       ; %bb.0:
6042; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
6043; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6044; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6045; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6046; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s6
6047; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
6048; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s7
6049; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
6050; GFX9-NEXT:    s_sub_i32 s6, 0, s3
6051; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6052; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6053; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6054; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6055; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
6056; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
6057; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
6058; GFX9-NEXT:    s_mul_i32 s6, s6, s7
6059; GFX9-NEXT:    s_mul_hi_u32 s6, s7, s6
6060; GFX9-NEXT:    s_add_i32 s7, s7, s6
6061; GFX9-NEXT:    s_mul_hi_u32 s6, s4, s7
6062; GFX9-NEXT:    s_mul_i32 s6, s6, s3
6063; GFX9-NEXT:    s_sub_i32 s4, s4, s6
6064; GFX9-NEXT:    s_sub_i32 s6, s4, s3
6065; GFX9-NEXT:    s_cmp_ge_u32 s4, s3
6066; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
6067; GFX9-NEXT:    s_sub_i32 s6, s4, s3
6068; GFX9-NEXT:    s_cmp_ge_u32 s4, s3
6069; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
6070; GFX9-NEXT:    s_cselect_b32 s3, s6, s4
6071; GFX9-NEXT:    s_sub_i32 s4, 0, s2
6072; GFX9-NEXT:    s_mul_i32 s4, s4, s8
6073; GFX9-NEXT:    s_mul_hi_u32 s4, s8, s4
6074; GFX9-NEXT:    s_add_i32 s8, s8, s4
6075; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s8
6076; GFX9-NEXT:    s_mul_i32 s4, s4, s2
6077; GFX9-NEXT:    s_sub_i32 s4, s5, s4
6078; GFX9-NEXT:    s_sub_i32 s5, s4, s2
6079; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
6080; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
6081; GFX9-NEXT:    s_sub_i32 s5, s4, s2
6082; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
6083; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
6084; GFX9-NEXT:    v_mov_b32_e32 v0, s3
6085; GFX9-NEXT:    v_mov_b32_e32 v1, s2
6086; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
6087; GFX9-NEXT:    s_endpgm
6088  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
6089  %r = urem <2 x i32> %x, %shl.y
6090  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6091  ret void
6092}
6093
; Signed i32 division by the odd constant 1235195; the quotient is stored to %out.
; The opt-run checks expect the sdiv to pass through unchanged. Both llc runs
; expect a signed multiply-high by 0xd9528441, an add of the dividend, an
; arithmetic shift right by 20, and an add of the intermediate's sign bit
; (obtained with a logical shift right by 31).
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing by hand.
6094define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
6095; CHECK-LABEL: @sdiv_i32_oddk_denom(
6096; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195
6097; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6098; CHECK-NEXT:    ret void
6099;
6100; GFX6-LABEL: sdiv_i32_oddk_denom:
6101; GFX6:       ; %bb.0:
6102; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
6103; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6104; GFX6-NEXT:    v_mov_b32_e32 v0, 0xd9528441
6105; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6106; GFX6-NEXT:    s_mov_b32 s2, -1
6107; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6108; GFX6-NEXT:    v_mul_hi_i32 v0, s4, v0
6109; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
6110; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
6111; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
6112; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6113; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6114; GFX6-NEXT:    s_endpgm
6115;
6116; GFX9-LABEL: sdiv_i32_oddk_denom:
6117; GFX9:       ; %bb.0:
6118; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6119; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6120; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6121; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6122; GFX9-NEXT:    s_mul_hi_i32 s0, s4, 0xd9528441
6123; GFX9-NEXT:    s_add_i32 s0, s0, s4
6124; GFX9-NEXT:    s_lshr_b32 s1, s0, 31
6125; GFX9-NEXT:    s_ashr_i32 s0, s0, 20
6126; GFX9-NEXT:    s_add_i32 s0, s0, s1
6127; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6128; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6129; GFX9-NEXT:    s_endpgm
6130  %r = sdiv i32 %x, 1235195
6131  store i32 %r, i32 addrspace(1)* %out
6132  ret void
6133}
6134
; Signed i32 division by the power-of-two constant 4096; the quotient is
; stored to %out.
; The opt-run checks expect the sdiv to pass through unchanged. Both llc runs
; expect shifts only: the dividend's sign mask (arithmetic shift right by 31)
; is shifted right logically by 20, added to the dividend, and the sum is
; shifted right arithmetically by 12.
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing by hand.
6135define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
6136; CHECK-LABEL: @sdiv_i32_pow2k_denom(
6137; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 4096
6138; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6139; CHECK-NEXT:    ret void
6140;
6141; GFX6-LABEL: sdiv_i32_pow2k_denom:
6142; GFX6:       ; %bb.0:
6143; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
6144; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6145; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6146; GFX6-NEXT:    s_mov_b32 s2, -1
6147; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6148; GFX6-NEXT:    s_ashr_i32 s5, s4, 31
6149; GFX6-NEXT:    s_lshr_b32 s5, s5, 20
6150; GFX6-NEXT:    s_add_i32 s4, s4, s5
6151; GFX6-NEXT:    s_ashr_i32 s4, s4, 12
6152; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6153; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6154; GFX6-NEXT:    s_endpgm
6155;
6156; GFX9-LABEL: sdiv_i32_pow2k_denom:
6157; GFX9:       ; %bb.0:
6158; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6159; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6160; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6161; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6162; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
6163; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6164; GFX9-NEXT:    s_add_i32 s4, s4, s0
6165; GFX9-NEXT:    s_ashr_i32 s0, s4, 12
6166; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6167; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6168; GFX9-NEXT:    s_endpgm
6169  %r = sdiv i32 %x, 4096
6170  store i32 %r, i32 addrspace(1)* %out
6171  ret void
6172}
6173
; Signed i32 division by a variable denominator computed as 4096 << %y; the
; quotient is stored to %out.
; The opt-run checks expect the shl and the sdiv to pass through unchanged.
; Both llc runs expect a full expansion: ashr-31 / add / xor steps on both
; operands, a v_rcp_iflag_f32-based reciprocal estimate, two conditional
; quotient increments, and a final xor/sub with the xor of the two sign masks.
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing by hand.
6174define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
6175; CHECK-LABEL: @sdiv_i32_pow2_shl_denom(
6176; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
6177; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]]
6178; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6179; CHECK-NEXT:    ret void
6180;
6181; GFX6-LABEL: sdiv_i32_pow2_shl_denom:
6182; GFX6:       ; %bb.0:
6183; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
6184; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6185; GFX6-NEXT:    s_mov_b32 s6, -1
6186; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6187; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
6188; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
6189; GFX6-NEXT:    s_add_i32 s3, s3, s8
6190; GFX6-NEXT:    s_xor_b32 s3, s3, s8
6191; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
6192; GFX6-NEXT:    s_sub_i32 s4, 0, s3
6193; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6194; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6195; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6196; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
6197; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6198; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
6199; GFX6-NEXT:    s_add_i32 s1, s2, s0
6200; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6201; GFX6-NEXT:    s_xor_b32 s1, s1, s0
6202; GFX6-NEXT:    s_xor_b32 s2, s0, s8
6203; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6204; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
6205; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
6206; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
6207; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
6208; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
6209; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
6210; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
6211; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
6212; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
6213; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
6214; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6215; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
6216; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
6217; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6218; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6219; GFX6-NEXT:    s_endpgm
6220;
6221; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
6222; GFX9:       ; %bb.0:
6223; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6224; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6225; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6226; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6227; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
6228; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
6229; GFX9-NEXT:    s_add_i32 s3, s3, s4
6230; GFX9-NEXT:    s_xor_b32 s3, s3, s4
6231; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
6232; GFX9-NEXT:    s_sub_i32 s5, 0, s3
6233; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6234; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6235; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6236; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
6237; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
6238; GFX9-NEXT:    s_add_i32 s2, s2, s5
6239; GFX9-NEXT:    s_xor_b32 s2, s2, s5
6240; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
6241; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
6242; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
6243; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
6244; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
6245; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
6246; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
6247; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6248; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
6249; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
6250; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
6251; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
6252; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
6253; GFX9-NEXT:    s_xor_b32 s2, s5, s4
6254; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
6255; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
6256; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
6257; GFX9-NEXT:    s_endpgm
6258  %shl.y = shl i32 4096, %y
6259  %r = sdiv i32 %x, %shl.y
6260  store i32 %r, i32 addrspace(1)* %out
6261  ret void
6262}
6263
; <2 x i32> signed division by the splat constant <4096, 4096>; the quotient
; vector is stored to %out.
; The opt-run checks expect the vector sdiv to be split into one scalar sdiv
; by 4096 per element (extractelement / insertelement). Both llc runs expect
; the ashr 31 / lshr 20 / add / ashr 12 shift sequence for each element.
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing by hand.
6264define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
6265; CHECK-LABEL: @sdiv_v2i32_pow2k_denom(
6266; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6267; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
6268; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
6269; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6270; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096
6271; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6272; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6273; CHECK-NEXT:    ret void
6274;
6275; GFX6-LABEL: sdiv_v2i32_pow2k_denom:
6276; GFX6:       ; %bb.0:
6277; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
6278; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6279; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6280; GFX6-NEXT:    s_mov_b32 s2, -1
6281; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6282; GFX6-NEXT:    s_ashr_i32 s6, s4, 31
6283; GFX6-NEXT:    s_ashr_i32 s7, s5, 31
6284; GFX6-NEXT:    s_lshr_b32 s6, s6, 20
6285; GFX6-NEXT:    s_add_i32 s4, s4, s6
6286; GFX6-NEXT:    s_lshr_b32 s6, s7, 20
6287; GFX6-NEXT:    s_add_i32 s5, s5, s6
6288; GFX6-NEXT:    s_ashr_i32 s4, s4, 12
6289; GFX6-NEXT:    s_ashr_i32 s5, s5, 12
6290; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6291; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6292; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6293; GFX6-NEXT:    s_endpgm
6294;
6295; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
6296; GFX9:       ; %bb.0:
6297; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6298; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
6299; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6300; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6301; GFX9-NEXT:    s_ashr_i32 s0, s2, 31
6302; GFX9-NEXT:    s_ashr_i32 s1, s3, 31
6303; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6304; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
6305; GFX9-NEXT:    s_add_i32 s0, s2, s0
6306; GFX9-NEXT:    s_add_i32 s1, s3, s1
6307; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
6308; GFX9-NEXT:    s_ashr_i32 s1, s1, 12
6309; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6310; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6311; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
6312; GFX9-NEXT:    s_endpgm
6313  %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
6314  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6315  ret void
6316}
6317
; <2 x i32> signed division by the mixed constant vector <4096, 4095>; the
; quotient vector is stored to %out.
; The opt-run checks expect one scalar sdiv per element (4096 for element 0,
; 4095 for element 1). In both llc runs element 0 is expected to use the
; shift-only sequence, while element 1 is expected to use a signed
; multiply-high by 0x80080081 followed by an add, an arithmetic shift right by
; 11, and an add of the sign bit.
; NOTE(review): the doubled "s" in the function name looks like a typo for
; "sdiv"; renaming would also require regenerating the label checks -- confirm
; before changing.
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing by hand.
6318define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
6319; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom(
6320; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6321; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
6322; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
6323; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6324; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095
6325; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6326; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6327; CHECK-NEXT:    ret void
6328;
6329; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
6330; GFX6:       ; %bb.0:
6331; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
6332; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6333; GFX6-NEXT:    v_mov_b32_e32 v0, 0x80080081
6334; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6335; GFX6-NEXT:    s_mov_b32 s2, -1
6336; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6337; GFX6-NEXT:    v_mul_hi_i32 v0, s5, v0
6338; GFX6-NEXT:    s_ashr_i32 s6, s4, 31
6339; GFX6-NEXT:    s_lshr_b32 s6, s6, 20
6340; GFX6-NEXT:    s_add_i32 s4, s4, s6
6341; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
6342; GFX6-NEXT:    s_ashr_i32 s4, s4, 12
6343; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
6344; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
6345; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
6346; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6347; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6348; GFX6-NEXT:    s_endpgm
6349;
6350; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
6351; GFX9:       ; %bb.0:
6352; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6353; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
6354; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6355; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6356; GFX9-NEXT:    s_ashr_i32 s0, s2, 31
6357; GFX9-NEXT:    s_mul_hi_i32 s1, s3, 0x80080081
6358; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6359; GFX9-NEXT:    s_add_i32 s1, s1, s3
6360; GFX9-NEXT:    s_add_i32 s0, s2, s0
6361; GFX9-NEXT:    s_lshr_b32 s2, s1, 31
6362; GFX9-NEXT:    s_ashr_i32 s1, s1, 11
6363; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
6364; GFX9-NEXT:    s_add_i32 s1, s1, s2
6365; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6366; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6367; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
6368; GFX9-NEXT:    s_endpgm
6369  %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
6370  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6371  ret void
6372}
6373
; <2 x i32> signed division by a variable denominator <4096, 4096> << %y; the
; quotient vector is stored to %out.
; Here the opt-run checks expect the division to be expanded in IR for each
; element: sign masks via ashr 31, add/xor on both operands, a reciprocal
; estimate through llvm.amdgcn.rcp.f32 scaled by 0x41EFFFFFC0000000, 64-bit
; multiplies whose high halves are extracted with lshr 32 / trunc, two
; compare/select quotient increments, and a final xor/sub with the xor of the
; two sign masks. Both llc runs expect a v_rcp_iflag_f32-based machine
; sequence for the two elements.
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing by hand.
6374define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
6375; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom(
6376; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
6377; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6378; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
6379; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
6380; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
6381; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
6382; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
6383; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
6384; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
6385; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
6386; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
6387; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
6388; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
6389; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
6390; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
6391; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
6392; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
6393; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
6394; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
6395; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
6396; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
6397; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
6398; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
6399; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
6400; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
6401; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
6402; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
6403; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
6404; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
6405; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
6406; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
6407; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
6408; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
6409; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
6410; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
6411; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
6412; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
6413; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
6414; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
6415; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
6416; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
6417; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0
6418; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1
6419; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
6420; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
6421; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
6422; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
6423; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
6424; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
6425; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
6426; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
6427; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
6428; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
6429; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
6430; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
6431; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
6432; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
6433; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
6434; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
6435; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
6436; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
6437; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
6438; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
6439; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
6440; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
6441; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
6442; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
6443; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
6444; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
6445; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
6446; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
6447; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
6448; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
6449; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
6450; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
6451; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
6452; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
6453; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
6454; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
6455; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
6456; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
6457; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
6458; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1
6459; CHECK-NEXT:    store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6460; CHECK-NEXT:    ret void
6461;
6462; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom:
6463; GFX6:       ; %bb.0:
6464; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xb
6465; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6466; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6467; GFX6-NEXT:    s_mov_b32 s6, -1
6468; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6469; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s10
6470; GFX6-NEXT:    s_ashr_i32 s1, s0, 31
6471; GFX6-NEXT:    s_add_i32 s0, s0, s1
6472; GFX6-NEXT:    s_xor_b32 s2, s0, s1
6473; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
6474; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s11
6475; GFX6-NEXT:    s_ashr_i32 s3, s0, 31
6476; GFX6-NEXT:    s_add_i32 s0, s0, s3
6477; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6478; GFX6-NEXT:    s_sub_i32 s11, 0, s2
6479; GFX6-NEXT:    s_xor_b32 s10, s0, s3
6480; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s10
6481; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6482; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6483; GFX6-NEXT:    s_ashr_i32 s0, s8, 31
6484; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6485; GFX6-NEXT:    s_add_i32 s8, s8, s0
6486; GFX6-NEXT:    v_mul_lo_u32 v2, s11, v0
6487; GFX6-NEXT:    s_xor_b32 s8, s8, s0
6488; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
6489; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
6490; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
6491; GFX6-NEXT:    s_xor_b32 s11, s0, s1
6492; GFX6-NEXT:    s_sub_i32 s0, 0, s10
6493; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
6494; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
6495; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
6496; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s2
6497; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
6498; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
6499; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
6500; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v3
6501; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
6502; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s2, v3
6503; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
6504; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
6505; GFX6-NEXT:    s_add_i32 s1, s9, s0
6506; GFX6-NEXT:    s_xor_b32 s1, s1, s0
6507; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
6508; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
6509; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
6510; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
6511; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s10
6512; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
6513; GFX6-NEXT:    s_xor_b32 s2, s0, s3
6514; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
6515; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
6516; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
6517; GFX6-NEXT:    v_xor_b32_e32 v0, s11, v0
6518; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
6519; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v2
6520; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s11, v0
6521; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
6522; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
6523; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
6524; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
6525; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
6526; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
6527; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6528; GFX6-NEXT:    s_endpgm
6529;
6530; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
6531; GFX9:       ; %bb.0:
6532; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
6533; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6534; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6535; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6536; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s6
6537; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
6538; GFX9-NEXT:    s_add_i32 s0, s0, s1
6539; GFX9-NEXT:    s_xor_b32 s0, s0, s1
6540; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
6541; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s7
6542; GFX9-NEXT:    s_ashr_i32 s8, s6, 31
6543; GFX9-NEXT:    s_add_i32 s6, s6, s8
6544; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6545; GFX9-NEXT:    s_xor_b32 s6, s6, s8
6546; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
6547; GFX9-NEXT:    s_sub_i32 s10, 0, s0
6548; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6549; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6550; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6551; GFX9-NEXT:    s_ashr_i32 s7, s4, 31
6552; GFX9-NEXT:    s_add_i32 s4, s4, s7
6553; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v0
6554; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
6555; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
6556; GFX9-NEXT:    s_sub_i32 s10, 0, s6
6557; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
6558; GFX9-NEXT:    s_xor_b32 s4, s4, s7
6559; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v1
6560; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
6561; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
6562; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
6563; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v4
6564; GFX9-NEXT:    s_add_i32 s5, s5, s9
6565; GFX9-NEXT:    s_xor_b32 s5, s5, s9
6566; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s0
6567; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
6568; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
6569; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
6570; GFX9-NEXT:    v_sub_u32_e32 v4, s4, v4
6571; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v4
6572; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6573; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v4
6574; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
6575; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
6576; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s6
6577; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
6578; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
6579; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
6580; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v3
6581; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
6582; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
6583; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
6584; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
6585; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
6586; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
6587; GFX9-NEXT:    s_xor_b32 s1, s7, s1
6588; GFX9-NEXT:    s_xor_b32 s0, s9, s8
6589; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
6590; GFX9-NEXT:    v_xor_b32_e32 v0, s1, v0
6591; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
6592; GFX9-NEXT:    v_subrev_u32_e32 v0, s1, v0
6593; GFX9-NEXT:    v_subrev_u32_e32 v1, s0, v1
6594; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
6595; GFX9-NEXT:    s_endpgm
6596  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
6597  %r = sdiv <2 x i32> %x, %shl.y
6598  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6599  ret void
6600}
6601
; Signed i32 remainder by the odd constant 1235195; the remainder is stored to
; %out.
; The opt-run checks expect the srem to pass through unchanged. Both llc runs
; expect a quotient formed with a signed multiply-high by 0xd9528441 plus
; add/shift steps, which is then multiplied by 0x12d8fb (1235195) and
; subtracted from the dividend.
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing by hand.
6602define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
6603; CHECK-LABEL: @srem_i32_oddk_denom(
6604; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 1235195
6605; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6606; CHECK-NEXT:    ret void
6607;
6608; GFX6-LABEL: srem_i32_oddk_denom:
6609; GFX6:       ; %bb.0:
6610; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
6611; GFX6-NEXT:    v_mov_b32_e32 v0, 0xd9528441
6612; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6613; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6614; GFX6-NEXT:    s_mov_b32 s2, -1
6615; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6616; GFX6-NEXT:    v_mul_hi_i32 v0, s4, v0
6617; GFX6-NEXT:    v_readfirstlane_b32 s5, v0
6618; GFX6-NEXT:    s_add_i32 s5, s5, s4
6619; GFX6-NEXT:    s_lshr_b32 s6, s5, 31
6620; GFX6-NEXT:    s_ashr_i32 s5, s5, 20
6621; GFX6-NEXT:    s_add_i32 s5, s5, s6
6622; GFX6-NEXT:    s_mul_i32 s5, s5, 0x12d8fb
6623; GFX6-NEXT:    s_sub_i32 s4, s4, s5
6624; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6625; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6626; GFX6-NEXT:    s_endpgm
6627;
6628; GFX9-LABEL: srem_i32_oddk_denom:
6629; GFX9:       ; %bb.0:
6630; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6631; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6632; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6633; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6634; GFX9-NEXT:    s_mul_hi_i32 s0, s4, 0xd9528441
6635; GFX9-NEXT:    s_add_i32 s0, s0, s4
6636; GFX9-NEXT:    s_lshr_b32 s1, s0, 31
6637; GFX9-NEXT:    s_ashr_i32 s0, s0, 20
6638; GFX9-NEXT:    s_add_i32 s0, s0, s1
6639; GFX9-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
6640; GFX9-NEXT:    s_sub_i32 s0, s4, s0
6641; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6642; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6643; GFX9-NEXT:    s_endpgm
6644  %r = srem i32 %x, 1235195
6645  store i32 %r, i32 addrspace(1)* %out
6646  ret void
6647}
6648
; Signed i32 remainder by the power-of-two constant 4096; the remainder is
; stored to %out.
; The opt-run checks expect the srem to pass through unchanged. Both llc runs
; expect the biased dividend (dividend plus its sign mask shifted right
; logically by 20) to be masked with 0xfffff000 and the result subtracted from
; the original dividend.
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing by hand.
6649define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
6650; CHECK-LABEL: @srem_i32_pow2k_denom(
6651; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 4096
6652; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6653; CHECK-NEXT:    ret void
6654;
6655; GFX6-LABEL: srem_i32_pow2k_denom:
6656; GFX6:       ; %bb.0:
6657; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
6658; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6659; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6660; GFX6-NEXT:    s_mov_b32 s2, -1
6661; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6662; GFX6-NEXT:    s_ashr_i32 s5, s4, 31
6663; GFX6-NEXT:    s_lshr_b32 s5, s5, 20
6664; GFX6-NEXT:    s_add_i32 s5, s4, s5
6665; GFX6-NEXT:    s_and_b32 s5, s5, 0xfffff000
6666; GFX6-NEXT:    s_sub_i32 s4, s4, s5
6667; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6668; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6669; GFX6-NEXT:    s_endpgm
6670;
6671; GFX9-LABEL: srem_i32_pow2k_denom:
6672; GFX9:       ; %bb.0:
6673; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6674; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6675; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6676; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6677; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
6678; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6679; GFX9-NEXT:    s_add_i32 s0, s4, s0
6680; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
6681; GFX9-NEXT:    s_sub_i32 s0, s4, s0
6682; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6683; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6684; GFX9-NEXT:    s_endpgm
6685  %r = srem i32 %x, 4096
6686  store i32 %r, i32 addrspace(1)* %out
6687  ret void
6688}
6689
; Signed i32 remainder by a variable denominator computed as 4096 << %y; the
; remainder is stored to %out.
; The opt-run checks expect the shl and the srem to pass through unchanged.
; Both llc runs expect a v_rcp_iflag_f32-based expansion after ashr-31 / add /
; xor steps on both operands, two conditional subtractions of the denominator
; from the remainder, and a final xor/sub with the sign mask derived from s2
; (presumably %x -- confirm against the kernarg layout).
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing by hand.
6690define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
6691; CHECK-LABEL: @srem_i32_pow2_shl_denom(
6692; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
6693; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]]
6694; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6695; CHECK-NEXT:    ret void
6696;
6697; GFX6-LABEL: srem_i32_pow2_shl_denom:
6698; GFX6:       ; %bb.0:
6699; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
6700; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6701; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6702; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
6703; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
6704; GFX6-NEXT:    s_add_i32 s3, s3, s4
6705; GFX6-NEXT:    s_xor_b32 s4, s3, s4
6706; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
6707; GFX6-NEXT:    s_sub_i32 s3, 0, s4
6708; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
6709; GFX6-NEXT:    s_add_i32 s2, s2, s5
6710; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6711; GFX6-NEXT:    s_xor_b32 s6, s2, s5
6712; GFX6-NEXT:    s_mov_b32 s2, -1
6713; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6714; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6715; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
6716; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6717; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6718; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6719; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
6720; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
6721; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
6722; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
6723; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
6724; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
6725; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
6726; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
6727; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
6728; GFX6-NEXT:    v_xor_b32_e32 v0, s5, v0
6729; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
6730; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6731; GFX6-NEXT:    s_endpgm
6732;
6733; GFX9-LABEL: srem_i32_pow2_shl_denom:
6734; GFX9:       ; %bb.0:
6735; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6736; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6737; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6738; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6739; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
6740; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
6741; GFX9-NEXT:    s_add_i32 s3, s3, s4
6742; GFX9-NEXT:    s_xor_b32 s3, s3, s4
6743; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
6744; GFX9-NEXT:    s_sub_i32 s5, 0, s3
6745; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
6746; GFX9-NEXT:    s_add_i32 s2, s2, s4
6747; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6748; GFX9-NEXT:    s_xor_b32 s2, s2, s4
6749; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6750; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6751; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
6752; GFX9-NEXT:    s_mul_i32 s5, s5, s6
6753; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
6754; GFX9-NEXT:    s_add_i32 s6, s6, s5
6755; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
6756; GFX9-NEXT:    s_mul_i32 s5, s5, s3
6757; GFX9-NEXT:    s_sub_i32 s2, s2, s5
6758; GFX9-NEXT:    s_sub_i32 s5, s2, s3
6759; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
6760; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
6761; GFX9-NEXT:    s_sub_i32 s5, s2, s3
6762; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
6763; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
6764; GFX9-NEXT:    s_xor_b32 s2, s2, s4
6765; GFX9-NEXT:    s_sub_i32 s2, s2, s4
6766; GFX9-NEXT:    v_mov_b32_e32 v0, s2
6767; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
6768; GFX9-NEXT:    s_endpgm
6769  %shl.y = shl i32 4096, %y
6770  %r = srem i32 %x, %shl.y
6771  store i32 %r, i32 addrspace(1)* %out
6772  ret void
6773}
6774
; Signed remainder of a <2 x i32> value by the splat constant 4096; the result
; is stored to %out.
;
; Expected output, as pinned by the assertions below:
;   * CHECK (opt run): the vector srem is split into one `srem i32 ..., 4096`
;     per element and the results are re-inserted into a vector.
;   * GFX6/GFX9 (llc runs): each element is handled with a sign-mask
;     (ashr 31), lshr 20, add, and-with-0xfffff000, subtract sequence; no
;     reciprocal-based division expansion appears.
6775define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
6776; CHECK-LABEL: @srem_v2i32_pow2k_denom(
6777; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6778; CHECK-NEXT:    [[TMP2:%.*]] = srem i32 [[TMP1]], 4096
6779; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
6780; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6781; CHECK-NEXT:    [[TMP5:%.*]] = srem i32 [[TMP4]], 4096
6782; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6783; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6784; CHECK-NEXT:    ret void
6785;
6786; GFX6-LABEL: srem_v2i32_pow2k_denom:
6787; GFX6:       ; %bb.0:
6788; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
6789; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6790; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6791; GFX6-NEXT:    s_mov_b32 s2, -1
6792; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6793; GFX6-NEXT:    s_ashr_i32 s6, s4, 31
6794; GFX6-NEXT:    s_lshr_b32 s6, s6, 20
6795; GFX6-NEXT:    s_add_i32 s6, s4, s6
6796; GFX6-NEXT:    s_ashr_i32 s7, s5, 31
6797; GFX6-NEXT:    s_and_b32 s6, s6, 0xfffff000
6798; GFX6-NEXT:    s_sub_i32 s4, s4, s6
6799; GFX6-NEXT:    s_lshr_b32 s6, s7, 20
6800; GFX6-NEXT:    s_add_i32 s6, s5, s6
6801; GFX6-NEXT:    s_and_b32 s6, s6, 0xfffff000
6802; GFX6-NEXT:    s_sub_i32 s5, s5, s6
6803; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6804; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6805; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6806; GFX6-NEXT:    s_endpgm
6807;
6808; GFX9-LABEL: srem_v2i32_pow2k_denom:
6809; GFX9:       ; %bb.0:
6810; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6811; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
6812; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6813; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6814; GFX9-NEXT:    s_ashr_i32 s0, s2, 31
6815; GFX9-NEXT:    s_ashr_i32 s1, s3, 31
6816; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6817; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
6818; GFX9-NEXT:    s_add_i32 s0, s2, s0
6819; GFX9-NEXT:    s_add_i32 s1, s3, s1
6820; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
6821; GFX9-NEXT:    s_and_b32 s1, s1, 0xfffff000
6822; GFX9-NEXT:    s_sub_i32 s0, s2, s0
6823; GFX9-NEXT:    s_sub_i32 s1, s3, s1
6824; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6825; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6826; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
6827; GFX9-NEXT:    s_endpgm
6828  %r = srem <2 x i32> %x, <i32 4096, i32 4096>
6829  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6830  ret void
6831}
6832
; Signed remainder of a <2 x i32> value by a per-element run-time denominator
; computed as `shl <4096, 4096>, %y`; the result is stored to %out.
;
; Expected output, as pinned by the assertions below:
;   * CHECK (opt run): the vector srem is expanded element by element. For
;     each element the checks expect: sign masks via `ashr 31`, absolute
;     values via add/xor, a float reciprocal estimate of the denominator
;     (uitofp, llvm.amdgcn.rcp.f32, fmul, fptoui), an integer adjustment of
;     that estimate, a high-half multiply with the numerator, two conditional
;     subtractions of the denominator from the remainder, and finally an
;     xor/sub with the numerator's sign mask.
;   * GFX6/GFX9 (llc runs): the same sequence in machine instructions; the
;     GFX6 lines use vector ALU instructions (v_mul_hi_u32, v_cndmask_b32)
;     while the GFX9 lines move the estimate to a scalar register with
;     v_readfirstlane_b32 and continue with scalar instructions.
6833define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
6834; CHECK-LABEL: @srem_v2i32_pow2_shl_denom(
6835; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
6836; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6837; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
6838; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
6839; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
6840; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
6841; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
6842; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
6843; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
6844; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
6845; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
6846; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
6847; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
6848; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
6849; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
6850; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
6851; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
6852; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
6853; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
6854; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
6855; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
6856; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
6857; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
6858; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
6859; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
6860; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
6861; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
6862; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
6863; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
6864; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
6865; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
6866; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
6867; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
6868; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
6869; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
6870; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
6871; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
6872; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
6873; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0
6874; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1
6875; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
6876; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
6877; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
6878; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
6879; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
6880; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
6881; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
6882; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
6883; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
6884; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
6885; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
6886; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
6887; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
6888; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
6889; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
6890; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
6891; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
6892; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
6893; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
6894; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
6895; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
6896; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
6897; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
6898; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
6899; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
6900; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
6901; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
6902; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
6903; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
6904; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
6905; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
6906; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
6907; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
6908; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
6909; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
6910; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
6911; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1
6912; CHECK-NEXT:    store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6913; CHECK-NEXT:    ret void
6914;
6915; GFX6-LABEL: srem_v2i32_pow2_shl_denom:
6916; GFX6:       ; %bb.0:
6917; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
6918; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6919; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6920; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s6
6921; GFX6-NEXT:    s_ashr_i32 s3, s2, 31
6922; GFX6-NEXT:    s_add_i32 s2, s2, s3
6923; GFX6-NEXT:    s_xor_b32 s6, s2, s3
6924; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
6925; GFX6-NEXT:    s_lshl_b32 s7, 0x1000, s7
6926; GFX6-NEXT:    s_ashr_i32 s8, s7, 31
6927; GFX6-NEXT:    s_add_i32 s7, s7, s8
6928; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6929; GFX6-NEXT:    s_xor_b32 s7, s7, s8
6930; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
6931; GFX6-NEXT:    s_sub_i32 s9, 0, s6
6932; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6933; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6934; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6935; GFX6-NEXT:    s_ashr_i32 s8, s4, 31
6936; GFX6-NEXT:    s_add_i32 s4, s4, s8
6937; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v0
6938; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
6939; GFX6-NEXT:    s_xor_b32 s4, s4, s8
6940; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
6941; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
6942; GFX6-NEXT:    s_sub_i32 s9, 0, s7
6943; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6944; GFX6-NEXT:    s_mov_b32 s2, -1
6945; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6946; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
6947; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v1
6948; GFX6-NEXT:    s_ashr_i32 s9, s5, 31
6949; GFX6-NEXT:    s_add_i32 s5, s5, s9
6950; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
6951; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
6952; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
6953; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
6954; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
6955; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6956; GFX6-NEXT:    s_xor_b32 s4, s5, s9
6957; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
6958; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v1
6959; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
6960; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
6961; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
6962; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6963; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
6964; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
6965; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
6966; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
6967; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
6968; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6969; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
6970; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
6971; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6972; GFX6-NEXT:    v_xor_b32_e32 v1, s9, v1
6973; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s9, v1
6974; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6975; GFX6-NEXT:    s_endpgm
6976;
6977; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
6978; GFX9:       ; %bb.0:
6979; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
6980; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6981; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6982; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6983; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s6
6984; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
6985; GFX9-NEXT:    s_add_i32 s2, s2, s3
6986; GFX9-NEXT:    s_xor_b32 s2, s2, s3
6987; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
6988; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s7
6989; GFX9-NEXT:    s_sub_i32 s7, 0, s2
6990; GFX9-NEXT:    s_ashr_i32 s6, s4, 31
6991; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6992; GFX9-NEXT:    s_add_i32 s4, s4, s6
6993; GFX9-NEXT:    s_xor_b32 s4, s4, s6
6994; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6995; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6996; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
6997; GFX9-NEXT:    s_mul_i32 s7, s7, s8
6998; GFX9-NEXT:    s_mul_hi_u32 s7, s8, s7
6999; GFX9-NEXT:    s_add_i32 s8, s8, s7
7000; GFX9-NEXT:    s_mul_hi_u32 s7, s4, s8
7001; GFX9-NEXT:    s_mul_i32 s7, s7, s2
7002; GFX9-NEXT:    s_sub_i32 s4, s4, s7
7003; GFX9-NEXT:    s_sub_i32 s7, s4, s2
7004; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
7005; GFX9-NEXT:    s_cselect_b32 s4, s7, s4
7006; GFX9-NEXT:    s_sub_i32 s7, s4, s2
7007; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
7008; GFX9-NEXT:    s_cselect_b32 s2, s7, s4
7009; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
7010; GFX9-NEXT:    s_add_i32 s3, s3, s4
7011; GFX9-NEXT:    s_xor_b32 s3, s3, s4
7012; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
7013; GFX9-NEXT:    s_xor_b32 s2, s2, s6
7014; GFX9-NEXT:    s_sub_i32 s2, s2, s6
7015; GFX9-NEXT:    s_sub_i32 s6, 0, s3
7016; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
7017; GFX9-NEXT:    s_ashr_i32 s4, s5, 31
7018; GFX9-NEXT:    s_add_i32 s5, s5, s4
7019; GFX9-NEXT:    s_xor_b32 s5, s5, s4
7020; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
7021; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7022; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
7023; GFX9-NEXT:    s_mul_i32 s6, s6, s7
7024; GFX9-NEXT:    s_mul_hi_u32 s6, s7, s6
7025; GFX9-NEXT:    s_add_i32 s7, s7, s6
7026; GFX9-NEXT:    s_mul_hi_u32 s6, s5, s7
7027; GFX9-NEXT:    s_mul_i32 s6, s6, s3
7028; GFX9-NEXT:    s_sub_i32 s5, s5, s6
7029; GFX9-NEXT:    s_sub_i32 s6, s5, s3
7030; GFX9-NEXT:    s_cmp_ge_u32 s5, s3
7031; GFX9-NEXT:    s_cselect_b32 s5, s6, s5
7032; GFX9-NEXT:    s_sub_i32 s6, s5, s3
7033; GFX9-NEXT:    s_cmp_ge_u32 s5, s3
7034; GFX9-NEXT:    s_cselect_b32 s3, s6, s5
7035; GFX9-NEXT:    s_xor_b32 s3, s3, s4
7036; GFX9-NEXT:    s_sub_i32 s3, s3, s4
7037; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7038; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7039; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
7040; GFX9-NEXT:    s_endpgm
7041  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
7042  %r = srem <2 x i32> %x, %shl.y
7043  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
7044  ret void
7045}
7046
; Unsigned 64-bit division of %x by the constant 1235195949943
; (0x11f976a7377); the quotient is stored to %out.
;
; Expected output, as pinned by the assertions below:
;   * CHECK (opt run): the `udiv i64` is left as-is in the IR.
;   * GFX6/GFX9 (llc runs): the division is expanded inline in the generated
;     code. The immediates 0x11f and 0x976a7377 that appear in the checks are
;     the high and low 32-bit halves of the divisor; 0xfffffee0 and 0x68958c89
;     (GFX9 lines) are the high and low halves of its 64-bit negation.
;     NOTE(review): the GFX6 lines load 0xfee0 with s_movk_i32, presumably
;     sign-extended to the same 0xfffffee0 value -- confirm against the ISA
;     documentation.
7047define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
7048; CHECK-LABEL: @udiv_i64_oddk_denom(
7049; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943
7050; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7051; CHECK-NEXT:    ret void
7052;
7053; GFX6-LABEL: udiv_i64_oddk_denom:
7054; GFX6:       ; %bb.0:
7055; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
7056; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7057; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7058; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
7059; GFX6-NEXT:    s_movk_i32 s4, 0xfee0
7060; GFX6-NEXT:    s_mov_b32 s5, 0x68958c89
7061; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
7062; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7063; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7064; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
7065; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7066; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
7067; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
7068; GFX6-NEXT:    s_movk_i32 s8, 0x11f
7069; GFX6-NEXT:    s_mov_b32 s9, 0x976a7377
7070; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s4
7071; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
7072; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s5
7073; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s5
7074; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7075; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
7076; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7077; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
7078; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
7079; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
7080; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
7081; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7082; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
7083; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
7084; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
7085; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
7086; GFX6-NEXT:    s_mov_b32 s6, -1
7087; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
7088; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
7089; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
7090; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7091; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7092; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7093; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
7094; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s4
7095; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
7096; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s5
7097; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7098; GFX6-NEXT:    s_mov_b32 s4, s0
7099; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
7100; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
7101; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7102; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
7103; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
7104; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
7105; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
7106; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7107; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
7108; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
7109; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
7110; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
7111; GFX6-NEXT:    s_mov_b32 s5, s1
7112; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
7113; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
7114; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
7115; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7116; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7117; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7118; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
7119; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
7120; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
7121; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
7122; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
7123; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
7124; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7125; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7126; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
7127; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
7128; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7129; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
7130; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
7131; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7132; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
7133; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
7134; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s9
7135; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
7136; GFX6-NEXT:    v_mov_b32_e32 v5, 0x11f
7137; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7138; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s9
7139; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7140; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
7141; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
7142; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
7143; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s9, v3
7144; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
7145; GFX6-NEXT:    s_movk_i32 s2, 0x11e
7146; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s2, v4
7147; GFX6-NEXT:    s_mov_b32 s9, 0x976a7376
7148; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
7149; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s9, v5
7150; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
7151; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s8, v4
7152; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
7153; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
7154; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
7155; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
7156; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
7157; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
7158; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
7159; GFX6-NEXT:    v_mov_b32_e32 v6, s3
7160; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
7161; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v2
7162; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
7163; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v3
7164; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
7165; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v2
7166; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
7167; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
7168; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
7169; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
7170; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7171; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7172; GFX6-NEXT:    s_endpgm
7173;
7174; GFX9-LABEL: udiv_i64_oddk_denom:
7175; GFX9:       ; %bb.0:
7176; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
7177; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7178; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7179; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
7180; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
7181; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7182; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7183; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7184; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
7185; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7186; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7187; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7188; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
7189; GFX9-NEXT:    s_mul_i32 s1, s0, 0xfffffee0
7190; GFX9-NEXT:    s_mul_hi_u32 s2, s0, 0x68958c89
7191; GFX9-NEXT:    s_add_i32 s1, s2, s1
7192; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
7193; GFX9-NEXT:    s_mul_i32 s3, s2, 0x68958c89
7194; GFX9-NEXT:    s_add_i32 s1, s1, s3
7195; GFX9-NEXT:    s_mul_i32 s9, s0, 0x68958c89
7196; GFX9-NEXT:    s_mul_hi_u32 s3, s0, s1
7197; GFX9-NEXT:    s_mul_i32 s8, s0, s1
7198; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s9
7199; GFX9-NEXT:    s_add_u32 s0, s0, s8
7200; GFX9-NEXT:    s_addc_u32 s3, 0, s3
7201; GFX9-NEXT:    s_mul_hi_u32 s10, s2, s9
7202; GFX9-NEXT:    s_mul_i32 s9, s2, s9
7203; GFX9-NEXT:    s_add_u32 s0, s0, s9
7204; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s1
7205; GFX9-NEXT:    s_addc_u32 s0, s3, s10
7206; GFX9-NEXT:    s_addc_u32 s3, s8, 0
7207; GFX9-NEXT:    s_mul_i32 s1, s2, s1
7208; GFX9-NEXT:    s_add_u32 s0, s0, s1
7209; GFX9-NEXT:    s_addc_u32 s1, 0, s3
7210; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
7211; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
7212; GFX9-NEXT:    s_addc_u32 s0, s2, s1
7213; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
7214; GFX9-NEXT:    s_mul_i32 s3, s2, 0xfffffee0
7215; GFX9-NEXT:    s_mul_hi_u32 s8, s2, 0x68958c89
7216; GFX9-NEXT:    s_mul_i32 s1, s0, 0x68958c89
7217; GFX9-NEXT:    s_add_i32 s3, s8, s3
7218; GFX9-NEXT:    s_add_i32 s3, s3, s1
7219; GFX9-NEXT:    s_mul_i32 s9, s2, 0x68958c89
7220; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
7221; GFX9-NEXT:    s_mul_i32 s8, s2, s3
7222; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s9
7223; GFX9-NEXT:    s_add_u32 s2, s2, s8
7224; GFX9-NEXT:    s_addc_u32 s1, 0, s1
7225; GFX9-NEXT:    s_mul_hi_u32 s10, s0, s9
7226; GFX9-NEXT:    s_mul_i32 s9, s0, s9
7227; GFX9-NEXT:    s_add_u32 s2, s2, s9
7228; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s3
7229; GFX9-NEXT:    s_addc_u32 s1, s1, s10
7230; GFX9-NEXT:    s_addc_u32 s2, s8, 0
7231; GFX9-NEXT:    s_mul_i32 s3, s0, s3
7232; GFX9-NEXT:    s_add_u32 s1, s1, s3
7233; GFX9-NEXT:    s_addc_u32 s2, 0, s2
7234; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v0
7235; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
7236; GFX9-NEXT:    s_addc_u32 s0, s0, s2
7237; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
7238; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7239; GFX9-NEXT:    s_mul_i32 s2, s6, s0
7240; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s3
7241; GFX9-NEXT:    s_mul_hi_u32 s1, s6, s0
7242; GFX9-NEXT:    s_add_u32 s2, s8, s2
7243; GFX9-NEXT:    s_addc_u32 s1, 0, s1
7244; GFX9-NEXT:    s_mul_hi_u32 s9, s7, s3
7245; GFX9-NEXT:    s_mul_i32 s3, s7, s3
7246; GFX9-NEXT:    s_add_u32 s2, s2, s3
7247; GFX9-NEXT:    s_mul_hi_u32 s8, s7, s0
7248; GFX9-NEXT:    s_addc_u32 s1, s1, s9
7249; GFX9-NEXT:    s_addc_u32 s2, s8, 0
7250; GFX9-NEXT:    s_mul_i32 s0, s7, s0
7251; GFX9-NEXT:    s_add_u32 s3, s1, s0
7252; GFX9-NEXT:    s_addc_u32 s2, 0, s2
7253; GFX9-NEXT:    s_mul_i32 s0, s3, 0x11f
7254; GFX9-NEXT:    s_mul_hi_u32 s8, s3, 0x976a7377
7255; GFX9-NEXT:    s_add_i32 s0, s8, s0
7256; GFX9-NEXT:    s_mul_i32 s8, s2, 0x976a7377
7257; GFX9-NEXT:    s_mul_i32 s9, s3, 0x976a7377
7258; GFX9-NEXT:    s_add_i32 s8, s0, s8
7259; GFX9-NEXT:    v_mov_b32_e32 v0, s9
7260; GFX9-NEXT:    s_sub_i32 s0, s7, s8
7261; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
7262; GFX9-NEXT:    s_mov_b32 s1, 0x976a7377
7263; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
7264; GFX9-NEXT:    s_subb_u32 s6, s0, 0x11f
7265; GFX9-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s1, v0
7266; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
7267; GFX9-NEXT:    s_subb_u32 s6, s6, 0
7268; GFX9-NEXT:    s_cmpk_gt_u32 s6, 0x11e
7269; GFX9-NEXT:    s_mov_b32 s10, 0x976a7376
7270; GFX9-NEXT:    s_cselect_b32 s9, -1, 0
7271; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s10, v1
7272; GFX9-NEXT:    s_cmpk_eq_i32 s6, 0x11f
7273; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
7274; GFX9-NEXT:    v_mov_b32_e32 v3, s9
7275; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
7276; GFX9-NEXT:    s_add_u32 s6, s3, 2
7277; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
7278; GFX9-NEXT:    s_addc_u32 s0, s2, 0
7279; GFX9-NEXT:    s_add_u32 s9, s3, 1
7280; GFX9-NEXT:    s_addc_u32 s1, s2, 0
7281; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
7282; GFX9-NEXT:    s_subb_u32 s7, s7, s8
7283; GFX9-NEXT:    s_cmpk_gt_u32 s7, 0x11e
7284; GFX9-NEXT:    v_mov_b32_e32 v3, s1
7285; GFX9-NEXT:    v_mov_b32_e32 v4, s0
7286; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
7287; GFX9-NEXT:    s_cselect_b32 s8, -1, 0
7288; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
7289; GFX9-NEXT:    s_cmpk_eq_i32 s7, 0x11f
7290; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v4, s[0:1]
7291; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
7292; GFX9-NEXT:    v_mov_b32_e32 v3, s8
7293; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
7294; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
7295; GFX9-NEXT:    v_mov_b32_e32 v3, s2
7296; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
7297; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
7298; GFX9-NEXT:    v_mov_b32_e32 v0, s9
7299; GFX9-NEXT:    v_mov_b32_e32 v3, s6
7300; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
7301; GFX9-NEXT:    v_mov_b32_e32 v3, s3
7302; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
7303; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
7304; GFX9-NEXT:    s_endpgm
7305  %r = udiv i64 %x, 1235195949943
7306  store i64 %r, i64 addrspace(1)* %out
7307  ret void
7308}
7309
; Unsigned 64-bit division of %x by the constant 4096; the quotient is stored
; to %out.
;
; Expected output, as pinned by the assertions below:
;   * CHECK (opt run): the `udiv i64 ..., 4096` is left as-is in the IR.
;   * GFX6/GFX9 (llc runs): the division is emitted as a single
;     `s_lshr_b64 ..., 12` on the loaded operand, followed by the store.
7310define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
7311; CHECK-LABEL: @udiv_i64_pow2k_denom(
7312; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 4096
7313; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7314; CHECK-NEXT:    ret void
7315;
7316; GFX6-LABEL: udiv_i64_pow2k_denom:
7317; GFX6:       ; %bb.0:
7318; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
7319; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7320; GFX6-NEXT:    s_mov_b32 s6, -1
7321; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7322; GFX6-NEXT:    s_mov_b32 s4, s0
7323; GFX6-NEXT:    s_mov_b32 s5, s1
7324; GFX6-NEXT:    s_lshr_b64 s[0:1], s[2:3], 12
7325; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7326; GFX6-NEXT:    v_mov_b32_e32 v1, s1
7327; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7328; GFX6-NEXT:    s_endpgm
7329;
7330; GFX9-LABEL: udiv_i64_pow2k_denom:
7331; GFX9:       ; %bb.0:
7332; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
7333; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7334; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7335; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
7336; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7337; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7338; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
7339; GFX9-NEXT:    s_endpgm
7340  %r = udiv i64 %x, 4096
7341  store i64 %r, i64 addrspace(1)* %out
7342  ret void
7343}
7344
; Unsigned 64-bit division of %x by a run-time denominator computed as
; `shl i64 4096, %y`; the quotient is stored to %out.
;
; Expected output, as pinned by the assertions below:
;   * CHECK (opt run): both the shl and the `udiv i64` are left as-is in the
;     IR.
;   * GFX6/GFX9 (llc runs): no division sequence is emitted. The checks
;     expect a single dword to be loaded for %y, 12 to be added to it, and
;     the result to be used as the shift amount of one `s_lshr_b64` on %x.
7345define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
7346; CHECK-LABEL: @udiv_i64_pow2_shl_denom(
7347; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
7348; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]]
7349; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7350; CHECK-NEXT:    ret void
7351;
7352; GFX6-LABEL: udiv_i64_pow2_shl_denom:
7353; GFX6:       ; %bb.0:
7354; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
7355; GFX6-NEXT:    s_load_dword s8, s[0:1], 0xd
7356; GFX6-NEXT:    s_mov_b32 s3, 0xf000
7357; GFX6-NEXT:    s_mov_b32 s2, -1
7358; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7359; GFX6-NEXT:    s_mov_b32 s0, s4
7360; GFX6-NEXT:    s_add_i32 s8, s8, 12
7361; GFX6-NEXT:    s_mov_b32 s1, s5
7362; GFX6-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
7363; GFX6-NEXT:    v_mov_b32_e32 v0, s4
7364; GFX6-NEXT:    v_mov_b32_e32 v1, s5
7365; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
7366; GFX6-NEXT:    s_endpgm
7367;
7368; GFX9-LABEL: udiv_i64_pow2_shl_denom:
7369; GFX9:       ; %bb.0:
7370; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
7371; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
7372; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7373; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7374; GFX9-NEXT:    s_add_i32 s2, s2, 12
7375; GFX9-NEXT:    s_lshr_b64 s[0:1], s[6:7], s2
7376; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7377; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7378; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
7379; GFX9-NEXT:    s_endpgm
7380  %shl.y = shl i64 4096, %y
7381  %r = udiv i64 %x, %shl.y
7382  store i64 %r, i64 addrspace(1)* %out
7383  ret void
7384}
7385
; Unsigned division of a <2 x i64> value by the splat constant 4096; the
; result is stored to %out.
;
; Expected output, as pinned by the assertions below:
;   * CHECK (opt run): the vector udiv is split into one
;     `udiv i64 ..., 4096` per element and the results are re-inserted into a
;     vector.
;   * GFX6/GFX9 (llc runs): each element becomes one `s_lshr_b64 ..., 12`,
;     and both results are written with a single dwordx4 store.
7386define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
7387; CHECK-LABEL: @udiv_v2i64_pow2k_denom(
7388; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7389; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
7390; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
7391; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
7392; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096
7393; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
7394; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
7395; CHECK-NEXT:    ret void
7396;
7397; GFX6-LABEL: udiv_v2i64_pow2k_denom:
7398; GFX6:       ; %bb.0:
7399; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
7400; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
7401; GFX6-NEXT:    s_mov_b32 s3, 0xf000
7402; GFX6-NEXT:    s_mov_b32 s2, -1
7403; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7404; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], 12
7405; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], 12
7406; GFX6-NEXT:    v_mov_b32_e32 v0, s4
7407; GFX6-NEXT:    v_mov_b32_e32 v1, s5
7408; GFX6-NEXT:    v_mov_b32_e32 v2, s6
7409; GFX6-NEXT:    v_mov_b32_e32 v3, s7
7410; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
7411; GFX6-NEXT:    s_endpgm
7412;
7413; GFX9-LABEL: udiv_v2i64_pow2k_denom:
7414; GFX9:       ; %bb.0:
7415; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
7416; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7417; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7418; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7419; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], 12
7420; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], 12
7421; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7422; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7423; GFX9-NEXT:    v_mov_b32_e32 v2, s4
7424; GFX9-NEXT:    v_mov_b32_e32 v3, s5
7425; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
7426; GFX9-NEXT:    s_endpgm
7427  %r = udiv <2 x i64> %x, <i64 4096, i64 4096>
7428  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
7429  ret void
7430}
7431
; Unsigned division of a <2 x i64> kernel argument by the constant vector
; <4096, 4095>: element 0 has a power-of-two divisor, element 1 does not.
; The opt checks show the vector udiv split into two scalar udivs. In the
; GFX6 and GFX9 checks element 0 is a single s_lshr_b64 by 12, while element 1
; is a long inline sequence that begins with v_rcp_f32 and continues with
; 32-bit mul_lo/mul_hi and add-with-carry steps, ending in compare/select
; operations that choose the final quotient words.
7432define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
7433; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom(
7434; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7435; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
7436; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
7437; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
7438; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095
7439; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
7440; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
7441; CHECK-NEXT:    ret void
7442;
7443; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom:
7444; GFX6:       ; %bb.0:
7445; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
7446; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
7447; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
7448; GFX6-NEXT:    s_movk_i32 s6, 0xf001
7449; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7450; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
7451; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7452; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7453; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7454; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
7455; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7456; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
7457; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
7458; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7459; GFX6-NEXT:    s_lshr_b64 s[8:9], s[0:1], 12
7460; GFX6-NEXT:    s_movk_i32 s0, 0xfff
7461; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
7462; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s6
7463; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
7464; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
7465; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7466; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
7467; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
7468; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
7469; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
7470; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7471; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
7472; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
7473; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
7474; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
7475; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
7476; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
7477; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
7478; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7479; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7480; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7481; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
7482; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
7483; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s6
7484; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s6
7485; GFX6-NEXT:    s_mov_b32 s6, -1
7486; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
7487; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7488; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
7489; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v4
7490; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
7491; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
7492; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7493; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
7494; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
7495; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
7496; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
7497; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
7498; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
7499; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
7500; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7501; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7502; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7503; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
7504; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
7505; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
7506; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
7507; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
7508; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
7509; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7510; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7511; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
7512; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
7513; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7514; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
7515; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
7516; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7517; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
7518; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s0
7519; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s0
7520; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
7521; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s0
7522; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
7523; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
7524; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
7525; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
7526; GFX6-NEXT:    v_mov_b32_e32 v5, s3
7527; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s2, v8
7528; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
7529; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s0, v8
7530; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
7531; GFX6-NEXT:    s_movk_i32 s0, 0xffe
7532; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
7533; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
7534; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
7535; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
7536; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
7537; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
7538; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
7539; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
7540; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
7541; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
7542; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
7543; GFX6-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
7544; GFX6-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
7545; GFX6-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
7546; GFX6-NEXT:    v_mov_b32_e32 v0, s8
7547; GFX6-NEXT:    v_mov_b32_e32 v1, s9
7548; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
7549; GFX6-NEXT:    s_endpgm
7550;
7551; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom:
7552; GFX9:       ; %bb.0:
7553; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
7554; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
7555; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
7556; GFX9-NEXT:    s_movk_i32 s2, 0xf001
7557; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7558; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7559; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
7560; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7561; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7562; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7563; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s2
7564; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s2
7565; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s2
7566; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
7567; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
7568; GFX9-NEXT:    v_mul_hi_u32 v5, v0, v3
7569; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
7570; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
7571; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v3
7572; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
7573; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
7574; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
7575; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
7576; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
7577; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
7578; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
7579; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
7580; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7581; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
7582; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
7583; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
7584; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s2
7585; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s2
7586; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s2
7587; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7588; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
7589; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
7590; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
7591; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
7592; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
7593; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
7594; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
7595; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
7596; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
7597; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
7598; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v5
7599; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
7600; GFX9-NEXT:    s_movk_i32 s0, 0xfff
7601; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7602; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], 12
7603; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
7604; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
7605; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v8, vcc
7606; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7607; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
7608; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
7609; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
7610; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
7611; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
7612; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
7613; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
7614; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
7615; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7616; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
7617; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
7618; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
7619; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7620; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
7621; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
7622; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v6, vcc
7623; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
7624; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
7625; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
7626; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s0
7627; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s0
7628; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s0
7629; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
7630; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
7631; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
7632; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
7633; GFX9-NEXT:    v_mov_b32_e32 v6, s7
7634; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s6, v9
7635; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
7636; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s0, v9
7637; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc
7638; GFX9-NEXT:    s_movk_i32 s0, 0xffe
7639; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
7640; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
7641; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
7642; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
7643; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
7644; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
7645; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
7646; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
7647; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v6, s[0:1]
7648; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
7649; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
7650; GFX9-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
7651; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
7652; GFX9-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
7653; GFX9-NEXT:    v_mov_b32_e32 v0, s4
7654; GFX9-NEXT:    v_mov_b32_e32 v1, s5
7655; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
7656; GFX9-NEXT:    s_endpgm
7657  %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
7658  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
7659  ret void
7660}
7661
; Unsigned division of <2 x i64> %x by a variable divisor formed as
; (<4096, 4096> shl %y); the quotient vector is stored to %out.
; The opt checks keep the shl and split the vector udiv into two scalar udivs
; with variable divisors. In the GFX6 and GFX9 checks no divide sequence
; appears: 12 is added to the low dword of each %y element and the matching
; %x element is shifted right by that amount with s_lshr_b64.
7662define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
7663; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom(
7664; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
7665; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7666; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
7667; CHECK-NEXT:    [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]]
7668; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
7669; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
7670; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
7671; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]]
7672; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
7673; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
7674; CHECK-NEXT:    ret void
7675;
7676; GFX6-LABEL: udiv_v2i64_pow2_shl_denom:
7677; GFX6:       ; %bb.0:
7678; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
7679; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
7680; GFX6-NEXT:    s_mov_b32 s3, 0xf000
7681; GFX6-NEXT:    s_mov_b32 s2, -1
7682; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7683; GFX6-NEXT:    s_add_i32 s8, s8, 12
7684; GFX6-NEXT:    s_add_i32 s9, s10, 12
7685; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
7686; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
7687; GFX6-NEXT:    v_mov_b32_e32 v0, s4
7688; GFX6-NEXT:    v_mov_b32_e32 v1, s5
7689; GFX6-NEXT:    v_mov_b32_e32 v2, s6
7690; GFX6-NEXT:    v_mov_b32_e32 v3, s7
7691; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
7692; GFX6-NEXT:    s_endpgm
7693;
7694; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
7695; GFX9:       ; %bb.0:
7696; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
7697; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7698; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
7699; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7700; GFX9-NEXT:    s_add_i32 s2, s8, 12
7701; GFX9-NEXT:    s_add_i32 s8, s10, 12
7702; GFX9-NEXT:    s_lshr_b64 s[2:3], s[4:5], s2
7703; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
7704; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7705; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7706; GFX9-NEXT:    v_mov_b32_e32 v2, s4
7707; GFX9-NEXT:    v_mov_b32_e32 v3, s5
7708; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
7709; GFX9-NEXT:    s_endpgm
7710  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
7711  %r = udiv <2 x i64> %x, %shl.y
7712  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
7713  ret void
7714}
7715
; Unsigned remainder of an i64 kernel argument by the odd constant
; 1235195393993 (0x11f9761f7c9, i.e. high dword 0x11f and low dword
; 0x9761f7c9, both of which appear as immediates in the checks below).
; The opt checks show the urem left as a single IR instruction. The GFX6 and
; GFX9 checks show it expanded inline into a long sequence that begins with
; v_rcp_f32, continues with 32-bit multiply and add-with-carry steps, and ends
; with subtract/compare/select operations against the divisor words.
; On GFX9 much of that arithmetic is done with scalar s_mul_* / s_addc_u32
; instructions fed through v_readfirstlane_b32.
7716define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
7717; CHECK-LABEL: @urem_i64_oddk_denom(
7718; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993
7719; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7720; CHECK-NEXT:    ret void
7721;
7722; GFX6-LABEL: urem_i64_oddk_denom:
7723; GFX6:       ; %bb.0:
7724; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
7725; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7726; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7727; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
7728; GFX6-NEXT:    s_movk_i32 s2, 0xfee0
7729; GFX6-NEXT:    s_mov_b32 s3, 0x689e0837
7730; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
7731; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7732; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7733; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
7734; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7735; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
7736; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
7737; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7738; GFX6-NEXT:    s_mov_b32 s8, s4
7739; GFX6-NEXT:    s_movk_i32 s4, 0x11f
7740; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
7741; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
7742; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
7743; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
7744; GFX6-NEXT:    s_mov_b32 s12, 0x9761f7c9
7745; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
7746; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7747; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
7748; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
7749; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
7750; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
7751; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7752; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
7753; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
7754; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
7755; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
7756; GFX6-NEXT:    s_mov_b32 s9, s5
7757; GFX6-NEXT:    s_movk_i32 s5, 0x11e
7758; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
7759; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
7760; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
7761; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7762; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7763; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7764; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
7765; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
7766; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
7767; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
7768; GFX6-NEXT:    s_mov_b32 s11, 0xf000
7769; GFX6-NEXT:    s_mov_b32 s10, -1
7770; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
7771; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
7772; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7773; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
7774; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
7775; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
7776; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
7777; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7778; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
7779; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
7780; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
7781; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
7782; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
7783; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
7784; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
7785; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7786; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7787; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7788; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
7789; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
7790; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
7791; GFX6-NEXT:    v_mul_hi_u32 v4, s6, v1
7792; GFX6-NEXT:    v_mul_hi_u32 v5, s7, v1
7793; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v1
7794; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7795; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7796; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
7797; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
7798; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7799; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
7800; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
7801; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7802; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
7803; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s4
7804; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s12
7805; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s12
7806; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s12
7807; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
7808; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
7809; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
7810; GFX6-NEXT:    v_mov_b32_e32 v3, 0x11f
7811; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
7812; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
7813; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
7814; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
7815; GFX6-NEXT:    v_cmp_lt_u32_e64 s[2:3], s5, v5
7816; GFX6-NEXT:    s_mov_b32 s6, 0x9761f7c8
7817; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
7818; GFX6-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v4
7819; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
7820; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
7821; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, v5
7822; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
7823; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
7824; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
7825; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
7826; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
7827; GFX6-NEXT:    v_mov_b32_e32 v5, s7
7828; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
7829; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v1
7830; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
7831; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
7832; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
7833; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
7834; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
7835; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
7836; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
7837; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
7838; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7839; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
7840; GFX6-NEXT:    s_endpgm
7841;
7842; GFX9-LABEL: urem_i64_oddk_denom:
7843; GFX9:       ; %bb.0:
7844; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
7845; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7846; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7847; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
7848; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
7849; GFX9-NEXT:    s_mov_b32 s12, 0x9761f7c8
7850; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7851; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7852; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7853; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
7854; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7855; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7856; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7857; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
7858; GFX9-NEXT:    s_mul_i32 s1, s0, 0xfffffee0
7859; GFX9-NEXT:    s_mul_hi_u32 s2, s0, 0x689e0837
7860; GFX9-NEXT:    s_add_i32 s1, s2, s1
7861; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
7862; GFX9-NEXT:    s_mul_i32 s3, s2, 0x689e0837
7863; GFX9-NEXT:    s_add_i32 s1, s1, s3
7864; GFX9-NEXT:    s_mul_i32 s9, s0, 0x689e0837
7865; GFX9-NEXT:    s_mul_hi_u32 s3, s0, s1
7866; GFX9-NEXT:    s_mul_i32 s8, s0, s1
7867; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s9
7868; GFX9-NEXT:    s_add_u32 s0, s0, s8
7869; GFX9-NEXT:    s_addc_u32 s3, 0, s3
7870; GFX9-NEXT:    s_mul_hi_u32 s10, s2, s9
7871; GFX9-NEXT:    s_mul_i32 s9, s2, s9
7872; GFX9-NEXT:    s_add_u32 s0, s0, s9
7873; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s1
7874; GFX9-NEXT:    s_addc_u32 s0, s3, s10
7875; GFX9-NEXT:    s_addc_u32 s3, s8, 0
7876; GFX9-NEXT:    s_mul_i32 s1, s2, s1
7877; GFX9-NEXT:    s_add_u32 s0, s0, s1
7878; GFX9-NEXT:    s_addc_u32 s1, 0, s3
7879; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
7880; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
7881; GFX9-NEXT:    s_addc_u32 s0, s2, s1
7882; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
7883; GFX9-NEXT:    s_mul_i32 s3, s2, 0xfffffee0
7884; GFX9-NEXT:    s_mul_hi_u32 s8, s2, 0x689e0837
7885; GFX9-NEXT:    s_mul_i32 s1, s0, 0x689e0837
7886; GFX9-NEXT:    s_add_i32 s3, s8, s3
7887; GFX9-NEXT:    s_add_i32 s3, s3, s1
7888; GFX9-NEXT:    s_mul_i32 s9, s2, 0x689e0837
7889; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
7890; GFX9-NEXT:    s_mul_i32 s8, s2, s3
7891; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s9
7892; GFX9-NEXT:    s_add_u32 s2, s2, s8
7893; GFX9-NEXT:    s_addc_u32 s1, 0, s1
7894; GFX9-NEXT:    s_mul_hi_u32 s10, s0, s9
7895; GFX9-NEXT:    s_mul_i32 s9, s0, s9
7896; GFX9-NEXT:    s_add_u32 s2, s2, s9
7897; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s3
7898; GFX9-NEXT:    s_addc_u32 s1, s1, s10
7899; GFX9-NEXT:    s_addc_u32 s2, s8, 0
7900; GFX9-NEXT:    s_mul_i32 s3, s0, s3
7901; GFX9-NEXT:    s_add_u32 s1, s1, s3
7902; GFX9-NEXT:    s_addc_u32 s2, 0, s2
7903; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v0
7904; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
7905; GFX9-NEXT:    s_addc_u32 s0, s0, s2
7906; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
7907; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7908; GFX9-NEXT:    s_mul_i32 s2, s6, s0
7909; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s3
7910; GFX9-NEXT:    s_mul_hi_u32 s1, s6, s0
7911; GFX9-NEXT:    s_add_u32 s2, s8, s2
7912; GFX9-NEXT:    s_addc_u32 s1, 0, s1
7913; GFX9-NEXT:    s_mul_hi_u32 s9, s7, s3
7914; GFX9-NEXT:    s_mul_i32 s3, s7, s3
7915; GFX9-NEXT:    s_add_u32 s2, s2, s3
7916; GFX9-NEXT:    s_mul_hi_u32 s8, s7, s0
7917; GFX9-NEXT:    s_addc_u32 s1, s1, s9
7918; GFX9-NEXT:    s_addc_u32 s2, s8, 0
7919; GFX9-NEXT:    s_mul_i32 s0, s7, s0
7920; GFX9-NEXT:    s_add_u32 s0, s1, s0
7921; GFX9-NEXT:    s_addc_u32 s1, 0, s2
7922; GFX9-NEXT:    s_mul_i32 s2, s0, 0x11f
7923; GFX9-NEXT:    s_mul_hi_u32 s3, s0, 0x9761f7c9
7924; GFX9-NEXT:    s_add_i32 s2, s3, s2
7925; GFX9-NEXT:    s_mul_i32 s1, s1, 0x9761f7c9
7926; GFX9-NEXT:    s_mul_i32 s0, s0, 0x9761f7c9
7927; GFX9-NEXT:    s_add_i32 s9, s2, s1
7928; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7929; GFX9-NEXT:    s_sub_i32 s1, s7, s9
7930; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
7931; GFX9-NEXT:    s_mov_b32 s8, 0x9761f7c9
7932; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
7933; GFX9-NEXT:    s_subb_u32 s6, s1, 0x11f
7934; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s8, v0
7935; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
7936; GFX9-NEXT:    s_subb_u32 s10, s6, 0
7937; GFX9-NEXT:    s_cmpk_gt_u32 s10, 0x11e
7938; GFX9-NEXT:    s_cselect_b32 s11, -1, 0
7939; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s12, v3
7940; GFX9-NEXT:    s_cmpk_eq_i32 s10, 0x11f
7941; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[2:3]
7942; GFX9-NEXT:    v_mov_b32_e32 v4, s11
7943; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
7944; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
7945; GFX9-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[2:3]
7946; GFX9-NEXT:    s_subb_u32 s2, s6, 0x11f
7947; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v3
7948; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
7949; GFX9-NEXT:    s_subb_u32 s0, s2, 0
7950; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
7951; GFX9-NEXT:    s_subb_u32 s2, s7, s9
7952; GFX9-NEXT:    s_cmpk_gt_u32 s2, 0x11e
7953; GFX9-NEXT:    v_mov_b32_e32 v5, s10
7954; GFX9-NEXT:    v_mov_b32_e32 v6, s0
7955; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
7956; GFX9-NEXT:    s_cselect_b32 s3, -1, 0
7957; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s12, v0
7958; GFX9-NEXT:    s_cmpk_eq_i32 s2, 0x11f
7959; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, v6, s[0:1]
7960; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
7961; GFX9-NEXT:    v_mov_b32_e32 v6, s3
7962; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
7963; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
7964; GFX9-NEXT:    v_mov_b32_e32 v6, s2
7965; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
7966; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
7967; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
7968; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
7969; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
7970; GFX9-NEXT:    s_endpgm
7971  %r = urem i64 %x, 1235195393993
7972  store i64 %r, i64 addrspace(1)* %out
7973  ret void
7974}
7975
; Unsigned remainder of an i64 kernel argument by the power-of-two constant
; 4096; the result is stored to %out.
; The opt checks show the urem left as a single IR instruction. The GFX6 and
; GFX9 checks show it reduced to an s_and_b32 of the low dword with 0xfff,
; with 0 stored as the high dword (v1 is set to 0).
7976define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
7977; CHECK-LABEL: @urem_i64_pow2k_denom(
7978; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 4096
7979; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7980; CHECK-NEXT:    ret void
7981;
7982; GFX6-LABEL: urem_i64_pow2k_denom:
7983; GFX6:       ; %bb.0:
7984; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
7985; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7986; GFX6-NEXT:    s_mov_b32 s6, -1
7987; GFX6-NEXT:    v_mov_b32_e32 v1, 0
7988; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7989; GFX6-NEXT:    s_mov_b32 s4, s0
7990; GFX6-NEXT:    s_and_b32 s0, s2, 0xfff
7991; GFX6-NEXT:    s_mov_b32 s5, s1
7992; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7993; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7994; GFX6-NEXT:    s_endpgm
7995;
7996; GFX9-LABEL: urem_i64_pow2k_denom:
7997; GFX9:       ; %bb.0:
7998; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
7999; GFX9-NEXT:    v_mov_b32_e32 v1, 0
8000; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8001; GFX9-NEXT:    s_and_b32 s2, s2, 0xfff
8002; GFX9-NEXT:    v_mov_b32_e32 v0, s2
8003; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
8004; GFX9-NEXT:    s_endpgm
8005  %r = urem i64 %x, 4096
8006  store i64 %r, i64 addrspace(1)* %out
8007  ret void
8008}
8009
; Unsigned remainder of i64 %x by a variable divisor formed as (4096 shl %y);
; the result is stored to %out.
; The opt checks show the shl and urem left unchanged. In the GFX6 and GFX9
; checks no divide sequence appears: 0x1000 is shifted left by %y
; (s_lshl_b64), -1 is added to the 64-bit value (s_add_u32 / s_addc_u32), and
; the result is ANDed with %x (s_and_b64).
8010define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
8011; CHECK-LABEL: @urem_i64_pow2_shl_denom(
8012; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
8013; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]]
8014; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
8015; CHECK-NEXT:    ret void
8016;
8017; GFX6-LABEL: urem_i64_pow2_shl_denom:
8018; GFX6:       ; %bb.0:
8019; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
8020; GFX6-NEXT:    s_load_dword s8, s[0:1], 0xd
8021; GFX6-NEXT:    s_mov_b32 s3, 0xf000
8022; GFX6-NEXT:    s_mov_b32 s2, -1
8023; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8024; GFX6-NEXT:    s_mov_b32 s0, s4
8025; GFX6-NEXT:    s_mov_b32 s1, s5
8026; GFX6-NEXT:    s_mov_b64 s[4:5], 0x1000
8027; GFX6-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
8028; GFX6-NEXT:    s_add_u32 s4, s4, -1
8029; GFX6-NEXT:    s_addc_u32 s5, s5, -1
8030; GFX6-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
8031; GFX6-NEXT:    v_mov_b32_e32 v0, s4
8032; GFX6-NEXT:    v_mov_b32_e32 v1, s5
8033; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
8034; GFX6-NEXT:    s_endpgm
8035;
8036; GFX9-LABEL: urem_i64_pow2_shl_denom:
8037; GFX9:       ; %bb.0:
8038; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
8039; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
8040; GFX9-NEXT:    s_mov_b64 s[0:1], 0x1000
8041; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8042; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8043; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
8044; GFX9-NEXT:    s_add_u32 s0, s0, -1
8045; GFX9-NEXT:    s_addc_u32 s1, s1, -1
8046; GFX9-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
8047; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8048; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8049; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
8050; GFX9-NEXT:    s_endpgm
8051  %shl.y = shl i64 4096, %y
8052  %r = urem i64 %x, %shl.y
8053  store i64 %r, i64 addrspace(1)* %out
8054  ret void
8055}
8056
; Unsigned remainder of a <2 x i64> kernel argument by the splat constant
; <4096, 4096>; the result vector is stored to %out.
; The opt checks show the vector urem split into one scalar urem per element
; (extractelement / urem / insertelement). In the GFX6 and GFX9 checks the low
; dword of each element is ANDed with 0xfff (s_and_b32) and both high dwords
; of the stored vector are 0 (v1 = 0, v3 = v1).
8057define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
8058; CHECK-LABEL: @urem_v2i64_pow2k_denom(
8059; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8060; CHECK-NEXT:    [[TMP2:%.*]] = urem i64 [[TMP1]], 4096
8061; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
8062; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8063; CHECK-NEXT:    [[TMP5:%.*]] = urem i64 [[TMP4]], 4096
8064; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8065; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8066; CHECK-NEXT:    ret void
8067;
8068; GFX6-LABEL: urem_v2i64_pow2k_denom:
8069; GFX6:       ; %bb.0:
8070; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
8071; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
8072; GFX6-NEXT:    v_mov_b32_e32 v1, 0
8073; GFX6-NEXT:    s_mov_b32 s3, 0xf000
8074; GFX6-NEXT:    s_mov_b32 s2, -1
8075; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8076; GFX6-NEXT:    s_and_b32 s4, s4, 0xfff
8077; GFX6-NEXT:    s_and_b32 s5, s6, 0xfff
8078; GFX6-NEXT:    v_mov_b32_e32 v0, s4
8079; GFX6-NEXT:    v_mov_b32_e32 v2, s5
8080; GFX6-NEXT:    v_mov_b32_e32 v3, v1
8081; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
8082; GFX6-NEXT:    s_endpgm
8083;
8084; GFX9-LABEL: urem_v2i64_pow2k_denom:
8085; GFX9:       ; %bb.0:
8086; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
8087; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8088; GFX9-NEXT:    v_mov_b32_e32 v1, 0
8089; GFX9-NEXT:    v_mov_b32_e32 v3, v1
8090; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8091; GFX9-NEXT:    s_and_b32 s0, s4, 0xfff
8092; GFX9-NEXT:    s_and_b32 s1, s6, 0xfff
8093; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8094; GFX9-NEXT:    v_mov_b32_e32 v2, s1
8095; GFX9-NEXT:    global_store_dwordx4 v1, v[0:3], s[2:3]
8096; GFX9-NEXT:    s_endpgm
8097  %r = urem <2 x i64> %x, <i64 4096, i64 4096>
8098  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
8099  ret void
8100}
8101
; Test: vector urem of <2 x i64> %x by a per-lane divisor of 4096 shifted left
; by %y.
; Expected opt output: the vector urem is split into one scalar i64 urem per
; lane (extractelement / urem / insertelement).
; Expected llc output on both targets: no division sequence; for each lane the
; shifted divisor (s_lshl_b64) has -1 added (s_add_u32 / s_addc_u32) and the
; result is used as a mask on the dividend (s_and_b64).
; The assertion lines below are autogenerated (see the NOTE on the first line
; of this file); regenerate them with the update script rather than editing
; them by hand.
8102define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
8103; CHECK-LABEL: @urem_v2i64_pow2_shl_denom(
8104; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
8105; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8106; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
8107; CHECK-NEXT:    [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]]
8108; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
8109; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
8110; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
8111; CHECK-NEXT:    [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]]
8112; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
8113; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8114; CHECK-NEXT:    ret void
8115;
8116; GFX6-LABEL: urem_v2i64_pow2_shl_denom:
8117; GFX6:       ; %bb.0:
8118; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
8119; GFX6-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0xd
8120; GFX6-NEXT:    s_mov_b64 s[12:13], 0x1000
8121; GFX6-NEXT:    s_mov_b32 s11, 0xf000
8122; GFX6-NEXT:    s_mov_b32 s10, -1
8123; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8124; GFX6-NEXT:    s_lshl_b64 s[6:7], s[12:13], s6
8125; GFX6-NEXT:    s_lshl_b64 s[4:5], s[12:13], s4
8126; GFX6-NEXT:    s_add_u32 s4, s4, -1
8127; GFX6-NEXT:    s_addc_u32 s5, s5, -1
8128; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
8129; GFX6-NEXT:    s_add_u32 s4, s6, -1
8130; GFX6-NEXT:    s_addc_u32 s5, s7, -1
8131; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
8132; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8133; GFX6-NEXT:    v_mov_b32_e32 v1, s1
8134; GFX6-NEXT:    v_mov_b32_e32 v2, s2
8135; GFX6-NEXT:    v_mov_b32_e32 v3, s3
8136; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
8137; GFX6-NEXT:    s_endpgm
8138;
8139; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
8140; GFX9:       ; %bb.0:
8141; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
8142; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
8143; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
8144; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8145; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8146; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
8147; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
8148; GFX9-NEXT:    s_add_u32 s2, s2, -1
8149; GFX9-NEXT:    s_addc_u32 s3, s3, -1
8150; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
8151; GFX9-NEXT:    s_add_u32 s4, s10, -1
8152; GFX9-NEXT:    s_addc_u32 s5, s11, -1
8153; GFX9-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
8154; GFX9-NEXT:    v_mov_b32_e32 v0, s2
8155; GFX9-NEXT:    v_mov_b32_e32 v1, s3
8156; GFX9-NEXT:    v_mov_b32_e32 v2, s4
8157; GFX9-NEXT:    v_mov_b32_e32 v3, s5
8158; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
8159; GFX9-NEXT:    s_endpgm
8160  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
8161  %r = urem <2 x i64> %x, %shl.y
8162  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
8163  ret void
8164}
8165
; Test: signed i64 division by the odd constant 1235195 (it appears as
; 0x12d8fb in the expected machine code).
; Expected opt output: the sdiv is left unchanged.
; Expected llc output on both targets: a long multiply / add-with-carry
; expansion seeded by v_rcp_f32, followed by compare-and-select steps that
; choose between the quotient estimate, estimate + 1 and estimate + 2, and a
; final xor / subtract with a mask produced by s_ashr_i32 ..., 31.
; The assertion lines below are autogenerated (see the NOTE on the first line
; of this file); regenerate them with the update script rather than editing
; them by hand.
8166define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
8167; CHECK-LABEL: @sdiv_i64_oddk_denom(
8168; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195
8169; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
8170; CHECK-NEXT:    ret void
8171;
8172; GFX6-LABEL: sdiv_i64_oddk_denom:
8173; GFX6:       ; %bb.0:
8174; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
8175; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
8176; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
8177; GFX6-NEXT:    s_mov_b32 s5, 0xffed2705
8178; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
8179; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8180; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8181; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8182; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
8183; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8184; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
8185; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8186; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8187; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
8188; GFX6-NEXT:    s_add_u32 s2, s2, s8
8189; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s5
8190; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
8191; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s5
8192; GFX6-NEXT:    s_mov_b32 s9, s8
8193; GFX6-NEXT:    s_addc_u32 s3, s3, s8
8194; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
8195; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
8196; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
8197; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
8198; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
8199; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
8200; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8201; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
8202; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
8203; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
8204; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
8205; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[8:9]
8206; GFX6-NEXT:    s_mov_b32 s4, s0
8207; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
8208; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
8209; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
8210; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8211; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8212; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8213; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
8214; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s5
8215; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
8216; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fb
8217; GFX6-NEXT:    s_mov_b32 s6, -1
8218; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
8219; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
8220; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
8221; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
8222; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
8223; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
8224; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
8225; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
8226; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
8227; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
8228; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
8229; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8230; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
8231; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
8232; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
8233; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8234; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8235; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8236; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
8237; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
8238; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
8239; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
8240; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
8241; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
8242; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8243; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8244; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
8245; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
8246; GFX6-NEXT:    s_mov_b32 s5, s1
8247; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
8248; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
8249; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
8250; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
8251; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
8252; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s0
8253; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s0
8254; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
8255; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s0
8256; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
8257; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
8258; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
8259; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8260; GFX6-NEXT:    v_mov_b32_e32 v5, s3
8261; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s2, v8
8262; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
8263; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s0, v8
8264; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
8265; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fa
8266; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
8267; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
8268; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
8269; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
8270; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
8271; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
8272; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
8273; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
8274; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
8275; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
8276; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
8277; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
8278; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
8279; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
8280; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
8281; GFX6-NEXT:    v_xor_b32_e32 v1, s8, v1
8282; GFX6-NEXT:    v_mov_b32_e32 v2, s8
8283; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
8284; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
8285; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8286; GFX6-NEXT:    s_endpgm
8287;
8288; GFX9-LABEL: sdiv_i64_oddk_denom:
8289; GFX9:       ; %bb.0:
8290; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
8291; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
8292; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
8293; GFX9-NEXT:    s_mov_b32 s2, 0xffed2705
8294; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
8295; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8296; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8297; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
8298; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8299; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
8300; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
8301; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s2
8302; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
8303; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
8304; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
8305; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
8306; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
8307; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
8308; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
8309; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v4
8310; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
8311; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
8312; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
8313; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
8314; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
8315; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
8316; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
8317; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
8318; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
8319; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
8320; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
8321; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
8322; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s2
8323; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
8324; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
8325; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8326; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
8327; GFX9-NEXT:    s_add_u32 s0, s6, s2
8328; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
8329; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
8330; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
8331; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v4
8332; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
8333; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v4
8334; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
8335; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
8336; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
8337; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
8338; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
8339; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
8340; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v5, vcc
8341; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
8342; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
8343; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
8344; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
8345; GFX9-NEXT:    s_mov_b32 s3, s2
8346; GFX9-NEXT:    s_addc_u32 s1, s7, s2
8347; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
8348; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
8349; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
8350; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
8351; GFX9-NEXT:    v_mul_hi_u32 v5, s0, v1
8352; GFX9-NEXT:    v_mul_hi_u32 v6, s1, v1
8353; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
8354; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
8355; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
8356; GFX9-NEXT:    v_mul_lo_u32 v5, s1, v0
8357; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
8358; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fb
8359; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8360; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
8361; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
8362; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v6, vcc
8363; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
8364; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
8365; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
8366; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s3
8367; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s3
8368; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s3
8369; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
8370; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
8371; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
8372; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
8373; GFX9-NEXT:    v_mov_b32_e32 v6, s1
8374; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s0, v9
8375; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
8376; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s3, v9
8377; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc
8378; GFX9-NEXT:    s_mov_b32 s0, 0x12d8fa
8379; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
8380; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
8381; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
8382; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
8383; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
8384; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
8385; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
8386; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
8387; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v6, s[0:1]
8388; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
8389; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
8390; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
8391; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
8392; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
8393; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
8394; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
8395; GFX9-NEXT:    v_mov_b32_e32 v2, s2
8396; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s2, v0
8397; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
8398; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
8399; GFX9-NEXT:    s_endpgm
8400  %r = sdiv i64 %x, 1235195
8401  store i64 %r, i64 addrspace(1)* %out
8402  ret void
8403}
8404
; Test: signed i64 division by the constant 4096.
; Expected opt output: the sdiv is left unchanged.
; Expected llc output on both targets: no division sequence; a bias built with
; s_ashr_i32 ..., 31 followed by s_lshr_b32 ..., 20 is added to the 64-bit
; value (s_add_u32 / s_addc_u32) and the sum is shifted with
; s_ashr_i64 ..., 12.
; The assertion lines below are autogenerated (see the NOTE on the first line
; of this file); regenerate them with the update script rather than editing
; them by hand.
8405define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
8406; CHECK-LABEL: @sdiv_i64_pow2k_denom(
8407; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 4096
8408; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
8409; CHECK-NEXT:    ret void
8410;
8411; GFX6-LABEL: sdiv_i64_pow2k_denom:
8412; GFX6:       ; %bb.0:
8413; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
8414; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8415; GFX6-NEXT:    s_mov_b32 s6, -1
8416; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8417; GFX6-NEXT:    s_mov_b32 s4, s0
8418; GFX6-NEXT:    s_ashr_i32 s0, s3, 31
8419; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
8420; GFX6-NEXT:    s_add_u32 s0, s2, s0
8421; GFX6-NEXT:    s_mov_b32 s5, s1
8422; GFX6-NEXT:    s_addc_u32 s1, s3, 0
8423; GFX6-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
8424; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8425; GFX6-NEXT:    v_mov_b32_e32 v1, s1
8426; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8427; GFX6-NEXT:    s_endpgm
8428;
8429; GFX9-LABEL: sdiv_i64_pow2k_denom:
8430; GFX9:       ; %bb.0:
8431; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
8432; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8433; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8434; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
8435; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
8436; GFX9-NEXT:    s_add_u32 s2, s2, s4
8437; GFX9-NEXT:    s_addc_u32 s3, s3, 0
8438; GFX9-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
8439; GFX9-NEXT:    v_mov_b32_e32 v0, s2
8440; GFX9-NEXT:    v_mov_b32_e32 v1, s3
8441; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
8442; GFX9-NEXT:    s_endpgm
8443  %r = sdiv i64 %x, 4096
8444  store i64 %r, i64 addrspace(1)* %out
8445  ret void
8446}
8447
; Test: signed i64 division by a variable divisor, 4096 shifted left by %y.
; Expected opt output: the shl and the sdiv are left unchanged.
; Expected llc output on both targets: a full division expansion seeded by
; v_rcp_f32 (multiply / add-with-carry refinement, compare-and-select
; correction, final xor / subtract with a sign mask); no shift-only sequence
; is expected here.
; In the gfx900 expectations most of the multiplies are s_mul_i32 /
; s_mul_hi_u32 on values moved with v_readfirstlane_b32, where the tahiti
; expectations use v_mul_lo_u32 / v_mul_hi_u32.
; The assertion lines below are autogenerated (see the NOTE on the first line
; of this file); regenerate them with the update script rather than editing
; them by hand.
8448define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
8449; CHECK-LABEL: @sdiv_i64_pow2_shl_denom(
8450; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
8451; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]]
8452; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
8453; CHECK-NEXT:    ret void
8454;
8455; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
8456; GFX6:       ; %bb.0:
8457; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xd
8458; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
8459; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8460; GFX6-NEXT:    s_mov_b32 s6, -1
8461; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8462; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
8463; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
8464; GFX6-NEXT:    s_add_u32 s2, s2, s8
8465; GFX6-NEXT:    s_mov_b32 s9, s8
8466; GFX6-NEXT:    s_addc_u32 s3, s3, s8
8467; GFX6-NEXT:    s_xor_b64 s[10:11], s[2:3], s[8:9]
8468; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s10
8469; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s11
8470; GFX6-NEXT:    s_sub_u32 s4, 0, s10
8471; GFX6-NEXT:    s_subb_u32 s5, 0, s11
8472; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
8473; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
8474; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
8475; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8476; GFX6-NEXT:    s_ashr_i32 s12, s3, 31
8477; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8478; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8479; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
8480; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8481; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
8482; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8483; GFX6-NEXT:    s_add_u32 s2, s2, s12
8484; GFX6-NEXT:    s_mov_b32 s13, s12
8485; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
8486; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
8487; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
8488; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
8489; GFX6-NEXT:    s_addc_u32 s3, s3, s12
8490; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
8491; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
8492; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
8493; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
8494; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
8495; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
8496; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8497; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
8498; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
8499; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
8500; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
8501; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
8502; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
8503; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
8504; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
8505; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8506; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8507; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8508; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
8509; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
8510; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
8511; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
8512; GFX6-NEXT:    s_mov_b32 s5, s1
8513; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8514; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
8515; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
8516; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
8517; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
8518; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
8519; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
8520; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
8521; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
8522; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
8523; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
8524; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8525; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
8526; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
8527; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
8528; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8529; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8530; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8531; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
8532; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
8533; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
8534; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
8535; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
8536; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
8537; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8538; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8539; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
8540; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
8541; GFX6-NEXT:    s_mov_b32 s4, s0
8542; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
8543; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
8544; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
8545; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
8546; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
8547; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
8548; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
8549; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
8550; GFX6-NEXT:    v_mov_b32_e32 v5, s11
8551; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
8552; GFX6-NEXT:    v_mul_lo_u32 v3, s10, v0
8553; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
8554; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
8555; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
8556; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
8557; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s10, v3
8558; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
8559; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v4
8560; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
8561; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v5
8562; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
8563; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v4
8564; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
8565; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
8566; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
8567; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
8568; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
8569; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
8570; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
8571; GFX6-NEXT:    v_mov_b32_e32 v6, s3
8572; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
8573; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
8574; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
8575; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
8576; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
8577; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v2
8578; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
8579; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
8580; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
8581; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
8582; GFX6-NEXT:    s_xor_b64 s[0:1], s[12:13], s[8:9]
8583; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
8584; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
8585; GFX6-NEXT:    v_xor_b32_e32 v1, s1, v1
8586; GFX6-NEXT:    v_mov_b32_e32 v2, s1
8587; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
8588; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
8589; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8590; GFX6-NEXT:    s_endpgm
8591;
8592; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
8593; GFX9:       ; %bb.0:
8594; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
8595; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
8596; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8597; GFX9-NEXT:    s_lshl_b64 s[4:5], s[2:3], s4
8598; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
8599; GFX9-NEXT:    s_add_u32 s4, s4, s2
8600; GFX9-NEXT:    s_mov_b32 s3, s2
8601; GFX9-NEXT:    s_addc_u32 s5, s5, s2
8602; GFX9-NEXT:    s_xor_b64 s[8:9], s[4:5], s[2:3]
8603; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
8604; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
8605; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
8606; GFX9-NEXT:    s_sub_u32 s0, 0, s8
8607; GFX9-NEXT:    s_subb_u32 s1, 0, s9
8608; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
8609; GFX9-NEXT:    v_rcp_f32_e32 v1, v0
8610; GFX9-NEXT:    v_mov_b32_e32 v0, 0
8611; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
8612; GFX9-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
8613; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
8614; GFX9-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
8615; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
8616; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
8617; GFX9-NEXT:    v_readfirstlane_b32 s10, v2
8618; GFX9-NEXT:    v_readfirstlane_b32 s11, v1
8619; GFX9-NEXT:    s_mul_i32 s12, s0, s10
8620; GFX9-NEXT:    s_mul_hi_u32 s14, s0, s11
8621; GFX9-NEXT:    s_mul_i32 s13, s1, s11
8622; GFX9-NEXT:    s_add_i32 s12, s14, s12
8623; GFX9-NEXT:    s_add_i32 s12, s12, s13
8624; GFX9-NEXT:    s_mul_i32 s15, s0, s11
8625; GFX9-NEXT:    s_mul_hi_u32 s13, s11, s12
8626; GFX9-NEXT:    s_mul_i32 s14, s11, s12
8627; GFX9-NEXT:    s_mul_hi_u32 s11, s11, s15
8628; GFX9-NEXT:    s_add_u32 s11, s11, s14
8629; GFX9-NEXT:    s_addc_u32 s13, 0, s13
8630; GFX9-NEXT:    s_mul_hi_u32 s16, s10, s15
8631; GFX9-NEXT:    s_mul_i32 s15, s10, s15
8632; GFX9-NEXT:    s_add_u32 s11, s11, s15
8633; GFX9-NEXT:    s_mul_hi_u32 s14, s10, s12
8634; GFX9-NEXT:    s_addc_u32 s11, s13, s16
8635; GFX9-NEXT:    s_addc_u32 s13, s14, 0
8636; GFX9-NEXT:    s_mul_i32 s12, s10, s12
8637; GFX9-NEXT:    s_add_u32 s11, s11, s12
8638; GFX9-NEXT:    s_addc_u32 s12, 0, s13
8639; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s11, v1
8640; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
8641; GFX9-NEXT:    s_addc_u32 s10, s10, s12
8642; GFX9-NEXT:    v_readfirstlane_b32 s12, v1
8643; GFX9-NEXT:    s_mul_i32 s11, s0, s10
8644; GFX9-NEXT:    s_mul_hi_u32 s13, s0, s12
8645; GFX9-NEXT:    s_add_i32 s11, s13, s11
8646; GFX9-NEXT:    s_mul_i32 s1, s1, s12
8647; GFX9-NEXT:    s_add_i32 s11, s11, s1
8648; GFX9-NEXT:    s_mul_i32 s0, s0, s12
8649; GFX9-NEXT:    s_mul_hi_u32 s13, s10, s0
8650; GFX9-NEXT:    s_mul_i32 s14, s10, s0
8651; GFX9-NEXT:    s_mul_i32 s16, s12, s11
8652; GFX9-NEXT:    s_mul_hi_u32 s0, s12, s0
8653; GFX9-NEXT:    s_mul_hi_u32 s15, s12, s11
8654; GFX9-NEXT:    s_add_u32 s0, s0, s16
8655; GFX9-NEXT:    s_addc_u32 s12, 0, s15
8656; GFX9-NEXT:    s_add_u32 s0, s0, s14
8657; GFX9-NEXT:    s_mul_hi_u32 s1, s10, s11
8658; GFX9-NEXT:    s_addc_u32 s0, s12, s13
8659; GFX9-NEXT:    s_addc_u32 s1, s1, 0
8660; GFX9-NEXT:    s_mul_i32 s11, s10, s11
8661; GFX9-NEXT:    s_add_u32 s0, s0, s11
8662; GFX9-NEXT:    s_addc_u32 s1, 0, s1
8663; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
8664; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
8665; GFX9-NEXT:    s_addc_u32 s12, s10, s1
8666; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8667; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
8668; GFX9-NEXT:    s_add_u32 s0, s6, s10
8669; GFX9-NEXT:    s_mov_b32 s11, s10
8670; GFX9-NEXT:    s_addc_u32 s1, s7, s10
8671; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
8672; GFX9-NEXT:    v_readfirstlane_b32 s13, v1
8673; GFX9-NEXT:    s_mul_i32 s1, s6, s12
8674; GFX9-NEXT:    s_mul_hi_u32 s14, s6, s13
8675; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s12
8676; GFX9-NEXT:    s_add_u32 s1, s14, s1
8677; GFX9-NEXT:    s_addc_u32 s0, 0, s0
8678; GFX9-NEXT:    s_mul_hi_u32 s15, s7, s13
8679; GFX9-NEXT:    s_mul_i32 s13, s7, s13
8680; GFX9-NEXT:    s_add_u32 s1, s1, s13
8681; GFX9-NEXT:    s_mul_hi_u32 s14, s7, s12
8682; GFX9-NEXT:    s_addc_u32 s0, s0, s15
8683; GFX9-NEXT:    s_addc_u32 s1, s14, 0
8684; GFX9-NEXT:    s_mul_i32 s12, s7, s12
8685; GFX9-NEXT:    s_add_u32 s12, s0, s12
8686; GFX9-NEXT:    s_addc_u32 s13, 0, s1
8687; GFX9-NEXT:    s_mul_i32 s0, s8, s13
8688; GFX9-NEXT:    s_mul_hi_u32 s1, s8, s12
8689; GFX9-NEXT:    s_add_i32 s0, s1, s0
8690; GFX9-NEXT:    s_mul_i32 s1, s9, s12
8691; GFX9-NEXT:    s_add_i32 s14, s0, s1
8692; GFX9-NEXT:    s_mul_i32 s1, s8, s12
8693; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8694; GFX9-NEXT:    s_sub_i32 s0, s7, s14
8695; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, s6, v1
8696; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
8697; GFX9-NEXT:    s_subb_u32 s6, s0, s9
8698; GFX9-NEXT:    v_subrev_co_u32_e64 v2, s[0:1], s8, v1
8699; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
8700; GFX9-NEXT:    s_subb_u32 s6, s6, 0
8701; GFX9-NEXT:    s_cmp_ge_u32 s6, s9
8702; GFX9-NEXT:    s_cselect_b32 s15, -1, 0
8703; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
8704; GFX9-NEXT:    s_cmp_eq_u32 s6, s9
8705; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
8706; GFX9-NEXT:    v_mov_b32_e32 v3, s15
8707; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
8708; GFX9-NEXT:    s_add_u32 s6, s12, 2
8709; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
8710; GFX9-NEXT:    s_addc_u32 s0, s13, 0
8711; GFX9-NEXT:    s_add_u32 s15, s12, 1
8712; GFX9-NEXT:    s_addc_u32 s1, s13, 0
8713; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
8714; GFX9-NEXT:    s_subb_u32 s7, s7, s14
8715; GFX9-NEXT:    s_cmp_ge_u32 s7, s9
8716; GFX9-NEXT:    v_mov_b32_e32 v3, s1
8717; GFX9-NEXT:    v_mov_b32_e32 v4, s0
8718; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
8719; GFX9-NEXT:    s_cselect_b32 s14, -1, 0
8720; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
8721; GFX9-NEXT:    s_cmp_eq_u32 s7, s9
8722; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
8723; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
8724; GFX9-NEXT:    v_mov_b32_e32 v3, s14
8725; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
8726; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
8727; GFX9-NEXT:    v_mov_b32_e32 v3, s13
8728; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
8729; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
8730; GFX9-NEXT:    v_mov_b32_e32 v2, s15
8731; GFX9-NEXT:    v_mov_b32_e32 v3, s6
8732; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
8733; GFX9-NEXT:    v_mov_b32_e32 v3, s12
8734; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
8735; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[2:3]
8736; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v2
8737; GFX9-NEXT:    v_xor_b32_e32 v3, s1, v1
8738; GFX9-NEXT:    v_mov_b32_e32 v4, s1
8739; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s0, v2
8740; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v4, vcc
8741; GFX9-NEXT:    global_store_dwordx2 v0, v[1:2], s[4:5]
8742; GFX9-NEXT:    s_endpgm
8743  %shl.y = shl i64 4096, %y
8744  %r = sdiv i64 %x, %shl.y
8745  store i64 %r, i64 addrspace(1)* %out
8746  ret void
8747}
8748
; Test: vector sdiv of <2 x i64> %x by the splat constant <4096, 4096>.
; Expected opt output: the vector sdiv is split into one scalar
; sdiv i64 ..., 4096 per lane (extractelement / sdiv / insertelement).
; Expected llc output on both targets: no division sequence; for each lane a
; bias built with s_ashr_i32 ..., 31 and s_lshr_b32 ..., 20 is added
; (s_add_u32 / s_addc_u32) and the sum is shifted with s_ashr_i64 ..., 12.
; The assertion lines below are autogenerated (see the NOTE on the first line
; of this file); regenerate them with the update script rather than editing
; them by hand.
8749define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
8750; CHECK-LABEL: @sdiv_v2i64_pow2k_denom(
8751; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8752; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
8753; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
8754; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8755; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096
8756; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8757; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8758; CHECK-NEXT:    ret void
8759;
8760; GFX6-LABEL: sdiv_v2i64_pow2k_denom:
8761; GFX6:       ; %bb.0:
8762; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
8763; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
8764; GFX6-NEXT:    s_mov_b32 s3, 0xf000
8765; GFX6-NEXT:    s_mov_b32 s2, -1
8766; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8767; GFX6-NEXT:    s_ashr_i32 s8, s5, 31
8768; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
8769; GFX6-NEXT:    s_add_u32 s4, s4, s8
8770; GFX6-NEXT:    s_addc_u32 s5, s5, 0
8771; GFX6-NEXT:    s_ashr_i32 s8, s7, 31
8772; GFX6-NEXT:    s_ashr_i64 s[4:5], s[4:5], 12
8773; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
8774; GFX6-NEXT:    s_add_u32 s6, s6, s8
8775; GFX6-NEXT:    s_addc_u32 s7, s7, 0
8776; GFX6-NEXT:    s_ashr_i64 s[6:7], s[6:7], 12
8777; GFX6-NEXT:    v_mov_b32_e32 v0, s4
8778; GFX6-NEXT:    v_mov_b32_e32 v1, s5
8779; GFX6-NEXT:    v_mov_b32_e32 v2, s6
8780; GFX6-NEXT:    v_mov_b32_e32 v3, s7
8781; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
8782; GFX6-NEXT:    s_endpgm
8783;
8784; GFX9-LABEL: sdiv_v2i64_pow2k_denom:
8785; GFX9:       ; %bb.0:
8786; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
8787; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8788; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8789; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8790; GFX9-NEXT:    s_ashr_i32 s0, s5, 31
8791; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
8792; GFX9-NEXT:    s_add_u32 s0, s4, s0
8793; GFX9-NEXT:    s_addc_u32 s1, s5, 0
8794; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
8795; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
8796; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
8797; GFX9-NEXT:    s_add_u32 s4, s6, s4
8798; GFX9-NEXT:    s_addc_u32 s5, s7, 0
8799; GFX9-NEXT:    s_ashr_i64 s[4:5], s[4:5], 12
8800; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8801; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8802; GFX9-NEXT:    v_mov_b32_e32 v2, s4
8803; GFX9-NEXT:    v_mov_b32_e32 v3, s5
8804; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
8805; GFX9-NEXT:    s_endpgm
8806  %r = sdiv <2 x i64> %x, <i64 4096, i64 4096>
8807  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
8808  ret void
8809}
8810
; Test: signed division of a <2 x i64> kernel argument by a constant vector
; whose lanes differ in kind: lane 0 divides by 4096 (a power of two) and
; lane 1 divides by 4095 (not a power of two).
;
; What the assertions below pin down:
;  * CHECK (opt run): the vector sdiv is split into one scalar i64 sdiv per
;    lane (by 4096 and by 4095) and the results are re-inserted into a vector.
;  * GFX6/GFX9 (llc runs): the lane-0 quotient is produced with a short scalar
;    shift/add sequence ending in s_ashr_i64 ..., 12, while the lane-1 quotient
;    goes through a long v_rcp_f32-seeded multiply/add/select sequence.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
;
; NOTE(review): the "ssdiv" prefix looks like a typo for "sdiv" (compare the
; neighbouring sdiv_v2i64_* tests) -- confirm before renaming, since a rename
; also requires regenerating every -LABEL line.
8811define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
8812; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom(
8813; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8814; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
8815; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
8816; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8817; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095
8818; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8819; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8820; CHECK-NEXT:    ret void
8821;
8822; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
8823; GFX6:       ; %bb.0:
8824; GFX6-NEXT:    v_mov_b32_e32 v0, 0x457ff000
8825; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
8826; GFX6-NEXT:    v_mac_f32_e32 v0, 0, v1
8827; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
8828; GFX6-NEXT:    s_movk_i32 s6, 0xf001
8829; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8830; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
8831; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8832; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8833; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8834; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
8835; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8836; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8837; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
8838; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8839; GFX6-NEXT:    s_ashr_i32 s8, s1, 31
8840; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
8841; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
8842; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s6
8843; GFX6-NEXT:    s_add_u32 s0, s0, s8
8844; GFX6-NEXT:    s_addc_u32 s1, s1, 0
8845; GFX6-NEXT:    s_ashr_i64 s[8:9], s[0:1], 12
8846; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8847; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
8848; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
8849; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
8850; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
8851; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
8852; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
8853; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8854; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8855; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
8856; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
8857; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
8858; GFX6-NEXT:    s_ashr_i32 s10, s3, 31
8859; GFX6-NEXT:    s_add_u32 s0, s2, s10
8860; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
8861; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
8862; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
8863; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8864; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8865; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8866; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
8867; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s6
8868; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s6
8869; GFX6-NEXT:    s_mov_b32 s11, s10
8870; GFX6-NEXT:    s_addc_u32 s1, s3, s10
8871; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[10:11]
8872; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
8873; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
8874; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
8875; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
8876; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
8877; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
8878; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
8879; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
8880; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
8881; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
8882; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
8883; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8884; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
8885; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
8886; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
8887; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8888; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8889; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8890; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
8891; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
8892; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
8893; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v1
8894; GFX6-NEXT:    v_mul_hi_u32 v5, s1, v1
8895; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v1
8896; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8897; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8898; GFX6-NEXT:    v_mul_lo_u32 v4, s1, v0
8899; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
8900; GFX6-NEXT:    s_movk_i32 s2, 0xfff
8901; GFX6-NEXT:    s_mov_b32 s6, -1
8902; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
8903; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
8904; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
8905; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
8906; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
8907; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s2
8908; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s2
8909; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
8910; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s2
8911; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
8912; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
8913; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
8914; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8915; GFX6-NEXT:    v_mov_b32_e32 v5, s1
8916; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s0, v8
8917; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
8918; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s2, v8
8919; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
8920; GFX6-NEXT:    s_movk_i32 s0, 0xffe
8921; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
8922; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
8923; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
8924; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
8925; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
8926; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
8927; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
8928; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
8929; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
8930; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
8931; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
8932; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
8933; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
8934; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
8935; GFX6-NEXT:    v_xor_b32_e32 v0, s10, v0
8936; GFX6-NEXT:    v_xor_b32_e32 v1, s10, v1
8937; GFX6-NEXT:    v_mov_b32_e32 v3, s10
8938; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s10, v0
8939; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
8940; GFX6-NEXT:    v_mov_b32_e32 v0, s8
8941; GFX6-NEXT:    v_mov_b32_e32 v1, s9
8942; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
8943; GFX6-NEXT:    s_endpgm
8944;
8945; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
8946; GFX9:       ; %bb.0:
8947; GFX9-NEXT:    v_mov_b32_e32 v0, 0x457ff000
8948; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
8949; GFX9-NEXT:    v_mac_f32_e32 v0, 0, v1
8950; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
8951; GFX9-NEXT:    s_movk_i32 s8, 0xf001
8952; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
8953; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8954; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8955; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8956; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
8957; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8958; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
8959; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
8960; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8961; GFX9-NEXT:    s_ashr_i32 s0, s5, 31
8962; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
8963; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s8
8964; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s8
8965; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
8966; GFX9-NEXT:    s_add_u32 s0, s4, s0
8967; GFX9-NEXT:    s_addc_u32 s1, s5, 0
8968; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
8969; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
8970; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
8971; GFX9-NEXT:    v_mul_hi_u32 v5, v0, v4
8972; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
8973; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
8974; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
8975; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
8976; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
8977; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v4
8978; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
8979; GFX9-NEXT:    s_ashr_i64 s[4:5], s[0:1], 12
8980; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
8981; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
8982; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
8983; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
8984; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
8985; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
8986; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
8987; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
8988; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
8989; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
8990; GFX9-NEXT:    s_ashr_i32 s8, s7, 31
8991; GFX9-NEXT:    s_add_u32 s0, s6, s8
8992; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
8993; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
8994; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
8995; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v4
8996; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
8997; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v4
8998; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
8999; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
9000; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
9001; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
9002; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
9003; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
9004; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v5, vcc
9005; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
9006; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
9007; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
9008; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
9009; GFX9-NEXT:    s_mov_b32 s9, s8
9010; GFX9-NEXT:    s_addc_u32 s1, s7, s8
9011; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
9012; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[8:9]
9013; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
9014; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
9015; GFX9-NEXT:    v_mul_hi_u32 v5, s0, v1
9016; GFX9-NEXT:    v_mul_hi_u32 v6, s1, v1
9017; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
9018; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9019; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
9020; GFX9-NEXT:    v_mul_lo_u32 v5, s1, v0
9021; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
9022; GFX9-NEXT:    s_movk_i32 s6, 0xfff
9023; GFX9-NEXT:    v_mov_b32_e32 v4, 0
9024; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
9025; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
9026; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v6, vcc
9027; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
9028; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
9029; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
9030; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s6
9031; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s6
9032; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s6
9033; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
9034; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
9035; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
9036; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
9037; GFX9-NEXT:    v_mov_b32_e32 v6, s1
9038; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s0, v9
9039; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
9040; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s6, v9
9041; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc
9042; GFX9-NEXT:    s_movk_i32 s0, 0xffe
9043; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
9044; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
9045; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
9046; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
9047; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
9048; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
9049; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
9050; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
9051; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v6, s[0:1]
9052; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
9053; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
9054; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
9055; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
9056; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
9057; GFX9-NEXT:    v_xor_b32_e32 v0, s8, v0
9058; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
9059; GFX9-NEXT:    v_mov_b32_e32 v3, s8
9060; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s8, v0
9061; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
9062; GFX9-NEXT:    v_mov_b32_e32 v0, s4
9063; GFX9-NEXT:    v_mov_b32_e32 v1, s5
9064; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
9065; GFX9-NEXT:    s_endpgm
; Input under test: lane 0 is divided by 4096 and lane 1 by 4095, then the
; whole result vector is stored to %out.
9066  %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
9067  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
9068  ret void
9069}
9070
9071define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
9072; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom(
9073; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
9074; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
9075; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
9076; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]]
9077; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
9078; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
9079; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
9080; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]]
9081; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
9082; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
9083; CHECK-NEXT:    ret void
9084;
9085; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom:
9086; GFX6:       ; %bb.0:
9087; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
9088; GFX6-NEXT:    s_mov_b64 s[12:13], 0x1000
9089; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9090; GFX6-NEXT:    s_lshl_b64 s[8:9], s[12:13], s8
9091; GFX6-NEXT:    s_lshl_b64 s[2:3], s[12:13], s10
9092; GFX6-NEXT:    s_ashr_i32 s14, s9, 31
9093; GFX6-NEXT:    s_add_u32 s8, s8, s14
9094; GFX6-NEXT:    s_mov_b32 s15, s14
9095; GFX6-NEXT:    s_addc_u32 s9, s9, s14
9096; GFX6-NEXT:    s_xor_b64 s[12:13], s[8:9], s[14:15]
9097; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
9098; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s13
9099; GFX6-NEXT:    s_sub_u32 s10, 0, s12
9100; GFX6-NEXT:    s_subb_u32 s11, 0, s13
9101; GFX6-NEXT:    s_ashr_i32 s16, s5, 31
9102; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
9103; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9104; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
9105; GFX6-NEXT:    s_add_u32 s0, s4, s16
9106; GFX6-NEXT:    s_mov_b32 s17, s16
9107; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9108; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9109; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9110; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9111; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9112; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9113; GFX6-NEXT:    s_addc_u32 s1, s5, s16
9114; GFX6-NEXT:    s_xor_b64 s[4:5], s[0:1], s[16:17]
9115; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
9116; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
9117; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v0
9118; GFX6-NEXT:    v_mul_lo_u32 v4, s10, v0
9119; GFX6-NEXT:    s_xor_b64 s[14:15], s[16:17], s[14:15]
9120; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9121; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
9122; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
9123; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
9124; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
9125; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
9126; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
9127; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
9128; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
9129; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
9130; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9131; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
9132; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
9133; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
9134; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9135; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9136; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9137; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
9138; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
9139; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
9140; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
9141; GFX6-NEXT:    s_mov_b32 s11, 0xf000
9142; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9143; GFX6-NEXT:    v_mul_lo_u32 v3, s10, v0
9144; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
9145; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
9146; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
9147; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
9148; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
9149; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
9150; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
9151; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
9152; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
9153; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9154; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
9155; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
9156; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
9157; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9158; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9159; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9160; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
9161; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
9162; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
9163; GFX6-NEXT:    v_mul_hi_u32 v4, s4, v1
9164; GFX6-NEXT:    v_mul_hi_u32 v5, s5, v1
9165; GFX6-NEXT:    v_mul_lo_u32 v1, s5, v1
9166; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9167; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9168; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
9169; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
9170; GFX6-NEXT:    s_mov_b32 s10, -1
9171; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9172; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
9173; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
9174; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
9175; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
9176; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v1
9177; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v0
9178; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v0
9179; GFX6-NEXT:    v_mov_b32_e32 v5, s13
9180; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9181; GFX6-NEXT:    v_mul_lo_u32 v3, s12, v0
9182; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
9183; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v2
9184; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
9185; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
9186; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s12, v3
9187; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
9188; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v4
9189; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
9190; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v5
9191; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
9192; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v4
9193; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
9194; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
9195; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
9196; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
9197; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
9198; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
9199; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
9200; GFX6-NEXT:    s_add_u32 s2, s2, s4
9201; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
9202; GFX6-NEXT:    v_mov_b32_e32 v6, s5
9203; GFX6-NEXT:    s_mov_b32 s5, s4
9204; GFX6-NEXT:    s_addc_u32 s3, s3, s4
9205; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
9206; GFX6-NEXT:    v_cvt_f32_u32_e32 v8, s2
9207; GFX6-NEXT:    v_cvt_f32_u32_e32 v9, s3
9208; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
9209; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
9210; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
9211; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
9212; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
9213; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
9214; GFX6-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v9
9215; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
9216; GFX6-NEXT:    v_rcp_f32_e32 v3, v8
9217; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
9218; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
9219; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
9220; GFX6-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
9221; GFX6-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
9222; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
9223; GFX6-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
9224; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
9225; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
9226; GFX6-NEXT:    s_sub_u32 s0, 0, s2
9227; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
9228; GFX6-NEXT:    v_mul_hi_u32 v2, s0, v3
9229; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
9230; GFX6-NEXT:    s_subb_u32 s1, 0, s3
9231; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v3
9232; GFX6-NEXT:    s_ashr_i32 s12, s7, 31
9233; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
9234; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
9235; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
9236; GFX6-NEXT:    v_mul_lo_u32 v6, v3, v2
9237; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
9238; GFX6-NEXT:    v_mul_hi_u32 v8, v3, v2
9239; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v2
9240; GFX6-NEXT:    v_mul_lo_u32 v2, v4, v2
9241; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
9242; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
9243; GFX6-NEXT:    v_mul_lo_u32 v8, v4, v5
9244; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
9245; GFX6-NEXT:    s_mov_b32 s13, s12
9246; GFX6-NEXT:    v_xor_b32_e32 v0, s14, v0
9247; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
9248; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
9249; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
9250; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
9251; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
9252; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9253; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
9254; GFX6-NEXT:    v_mul_lo_u32 v4, s0, v3
9255; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v2
9256; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v2
9257; GFX6-NEXT:    v_xor_b32_e32 v1, s15, v1
9258; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9259; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v2
9260; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
9261; GFX6-NEXT:    v_mul_lo_u32 v8, v2, v4
9262; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
9263; GFX6-NEXT:    v_mul_hi_u32 v10, v2, v4
9264; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
9265; GFX6-NEXT:    v_mul_lo_u32 v5, v3, v5
9266; GFX6-NEXT:    v_mul_hi_u32 v6, v3, v4
9267; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
9268; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
9269; GFX6-NEXT:    v_mul_lo_u32 v4, v3, v4
9270; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
9271; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
9272; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
9273; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9274; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
9275; GFX6-NEXT:    s_add_u32 s0, s6, s12
9276; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9277; GFX6-NEXT:    s_addc_u32 s1, s7, s12
9278; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
9279; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
9280; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v3
9281; GFX6-NEXT:    v_mul_hi_u32 v5, s6, v2
9282; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v3
9283; GFX6-NEXT:    v_mul_hi_u32 v8, s7, v3
9284; GFX6-NEXT:    v_mul_lo_u32 v3, s7, v3
9285; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9286; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
9287; GFX6-NEXT:    v_mul_lo_u32 v7, s7, v2
9288; GFX6-NEXT:    v_mul_hi_u32 v2, s7, v2
9289; GFX6-NEXT:    v_mov_b32_e32 v6, s15
9290; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
9291; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
9292; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
9293; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9294; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9295; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v3
9296; GFX6-NEXT:    v_mul_hi_u32 v5, s2, v2
9297; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v0
9298; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
9299; GFX6-NEXT:    v_mul_lo_u32 v6, s3, v2
9300; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9301; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v2
9302; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
9303; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s7, v4
9304; GFX6-NEXT:    v_mov_b32_e32 v7, s3
9305; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s6, v5
9306; GFX6-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
9307; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s2, v5
9308; GFX6-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
9309; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v6
9310; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
9311; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v7
9312; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
9313; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v6
9314; GFX6-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
9315; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v2
9316; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
9317; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v2
9318; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
9319; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
9320; GFX6-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
9321; GFX6-NEXT:    v_mov_b32_e32 v8, s7
9322; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v8, v4, vcc
9323; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v4
9324; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
9325; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v5
9326; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
9327; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v4
9328; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
9329; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
9330; GFX6-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
9331; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
9332; GFX6-NEXT:    s_xor_b64 s[0:1], s[12:13], s[4:5]
9333; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
9334; GFX6-NEXT:    v_xor_b32_e32 v2, s0, v2
9335; GFX6-NEXT:    v_xor_b32_e32 v3, s1, v3
9336; GFX6-NEXT:    v_mov_b32_e32 v4, s1
9337; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v2
9338; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
9339; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9340; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
9341; GFX6-NEXT:    s_endpgm
9342;
9343; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom:
9344; GFX9:       ; %bb.0:
9345; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
9346; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
9347; GFX9-NEXT:    v_mov_b32_e32 v4, 0
9348; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9349; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
9350; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
9351; GFX9-NEXT:    s_ashr_i32 s8, s3, 31
9352; GFX9-NEXT:    s_add_u32 s2, s2, s8
9353; GFX9-NEXT:    s_mov_b32 s9, s8
9354; GFX9-NEXT:    s_addc_u32 s3, s3, s8
9355; GFX9-NEXT:    s_xor_b64 s[12:13], s[2:3], s[8:9]
9356; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
9357; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
9358; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
9359; GFX9-NEXT:    s_sub_u32 s0, 0, s12
9360; GFX9-NEXT:    s_subb_u32 s1, 0, s13
9361; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
9362; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
9363; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9364; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9365; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
9366; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9367; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
9368; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
9369; GFX9-NEXT:    v_readfirstlane_b32 s14, v1
9370; GFX9-NEXT:    v_readfirstlane_b32 s15, v0
9371; GFX9-NEXT:    s_mul_i32 s16, s0, s14
9372; GFX9-NEXT:    s_mul_hi_u32 s18, s0, s15
9373; GFX9-NEXT:    s_mul_i32 s17, s1, s15
9374; GFX9-NEXT:    s_add_i32 s16, s18, s16
9375; GFX9-NEXT:    s_add_i32 s16, s16, s17
9376; GFX9-NEXT:    s_mul_i32 s19, s0, s15
9377; GFX9-NEXT:    s_mul_hi_u32 s17, s15, s16
9378; GFX9-NEXT:    s_mul_i32 s18, s15, s16
9379; GFX9-NEXT:    s_mul_hi_u32 s15, s15, s19
9380; GFX9-NEXT:    s_add_u32 s15, s15, s18
9381; GFX9-NEXT:    s_addc_u32 s17, 0, s17
9382; GFX9-NEXT:    s_mul_hi_u32 s20, s14, s19
9383; GFX9-NEXT:    s_mul_i32 s19, s14, s19
9384; GFX9-NEXT:    s_add_u32 s15, s15, s19
9385; GFX9-NEXT:    s_mul_hi_u32 s18, s14, s16
9386; GFX9-NEXT:    s_addc_u32 s15, s17, s20
9387; GFX9-NEXT:    s_addc_u32 s17, s18, 0
9388; GFX9-NEXT:    s_mul_i32 s16, s14, s16
9389; GFX9-NEXT:    s_add_u32 s15, s15, s16
9390; GFX9-NEXT:    s_addc_u32 s16, 0, s17
9391; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s15, v0
9392; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9393; GFX9-NEXT:    s_addc_u32 s14, s14, s16
9394; GFX9-NEXT:    v_readfirstlane_b32 s16, v0
9395; GFX9-NEXT:    s_mul_i32 s15, s0, s14
9396; GFX9-NEXT:    s_mul_hi_u32 s17, s0, s16
9397; GFX9-NEXT:    s_add_i32 s15, s17, s15
9398; GFX9-NEXT:    s_mul_i32 s1, s1, s16
9399; GFX9-NEXT:    s_add_i32 s15, s15, s1
9400; GFX9-NEXT:    s_mul_i32 s0, s0, s16
9401; GFX9-NEXT:    s_mul_hi_u32 s17, s14, s0
9402; GFX9-NEXT:    s_mul_i32 s18, s14, s0
9403; GFX9-NEXT:    s_mul_i32 s20, s16, s15
9404; GFX9-NEXT:    s_mul_hi_u32 s0, s16, s0
9405; GFX9-NEXT:    s_mul_hi_u32 s19, s16, s15
9406; GFX9-NEXT:    s_add_u32 s0, s0, s20
9407; GFX9-NEXT:    s_addc_u32 s16, 0, s19
9408; GFX9-NEXT:    s_add_u32 s0, s0, s18
9409; GFX9-NEXT:    s_mul_hi_u32 s1, s14, s15
9410; GFX9-NEXT:    s_addc_u32 s0, s16, s17
9411; GFX9-NEXT:    s_addc_u32 s1, s1, 0
9412; GFX9-NEXT:    s_mul_i32 s15, s14, s15
9413; GFX9-NEXT:    s_add_u32 s0, s0, s15
9414; GFX9-NEXT:    s_addc_u32 s1, 0, s1
9415; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
9416; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9417; GFX9-NEXT:    s_addc_u32 s16, s14, s1
9418; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
9419; GFX9-NEXT:    s_add_u32 s0, s4, s14
9420; GFX9-NEXT:    s_mov_b32 s15, s14
9421; GFX9-NEXT:    s_addc_u32 s1, s5, s14
9422; GFX9-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
9423; GFX9-NEXT:    v_readfirstlane_b32 s17, v0
9424; GFX9-NEXT:    s_mul_i32 s1, s4, s16
9425; GFX9-NEXT:    s_mul_hi_u32 s18, s4, s17
9426; GFX9-NEXT:    s_mul_hi_u32 s0, s4, s16
9427; GFX9-NEXT:    s_add_u32 s1, s18, s1
9428; GFX9-NEXT:    s_addc_u32 s0, 0, s0
9429; GFX9-NEXT:    s_mul_hi_u32 s19, s5, s17
9430; GFX9-NEXT:    s_mul_i32 s17, s5, s17
9431; GFX9-NEXT:    s_add_u32 s1, s1, s17
9432; GFX9-NEXT:    s_mul_hi_u32 s18, s5, s16
9433; GFX9-NEXT:    s_addc_u32 s0, s0, s19
9434; GFX9-NEXT:    s_addc_u32 s1, s18, 0
9435; GFX9-NEXT:    s_mul_i32 s16, s5, s16
9436; GFX9-NEXT:    s_add_u32 s16, s0, s16
9437; GFX9-NEXT:    s_addc_u32 s17, 0, s1
9438; GFX9-NEXT:    s_mul_i32 s0, s12, s17
9439; GFX9-NEXT:    s_mul_hi_u32 s1, s12, s16
9440; GFX9-NEXT:    s_add_i32 s0, s1, s0
9441; GFX9-NEXT:    s_mul_i32 s1, s13, s16
9442; GFX9-NEXT:    s_add_i32 s18, s0, s1
9443; GFX9-NEXT:    s_mul_i32 s1, s12, s16
9444; GFX9-NEXT:    v_mov_b32_e32 v0, s1
9445; GFX9-NEXT:    s_sub_i32 s0, s5, s18
9446; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
9447; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9448; GFX9-NEXT:    s_subb_u32 s4, s0, s13
9449; GFX9-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s12, v0
9450; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
9451; GFX9-NEXT:    s_subb_u32 s4, s4, 0
9452; GFX9-NEXT:    s_cmp_ge_u32 s4, s13
9453; GFX9-NEXT:    s_cselect_b32 s19, -1, 0
9454; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v1
9455; GFX9-NEXT:    s_cmp_eq_u32 s4, s13
9456; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
9457; GFX9-NEXT:    v_mov_b32_e32 v2, s19
9458; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
9459; GFX9-NEXT:    s_add_u32 s4, s16, 2
9460; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[0:1]
9461; GFX9-NEXT:    s_addc_u32 s0, s17, 0
9462; GFX9-NEXT:    s_add_u32 s19, s16, 1
9463; GFX9-NEXT:    s_addc_u32 s1, s17, 0
9464; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9465; GFX9-NEXT:    s_subb_u32 s5, s5, s18
9466; GFX9-NEXT:    s_cmp_ge_u32 s5, s13
9467; GFX9-NEXT:    v_mov_b32_e32 v2, s1
9468; GFX9-NEXT:    v_mov_b32_e32 v3, s0
9469; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
9470; GFX9-NEXT:    s_cselect_b32 s18, -1, 0
9471; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
9472; GFX9-NEXT:    s_cmp_eq_u32 s5, s13
9473; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[0:1]
9474; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
9475; GFX9-NEXT:    v_mov_b32_e32 v2, s18
9476; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
9477; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
9478; GFX9-NEXT:    v_mov_b32_e32 v2, s17
9479; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
9480; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
9481; GFX9-NEXT:    v_mov_b32_e32 v1, s19
9482; GFX9-NEXT:    v_mov_b32_e32 v2, s4
9483; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
9484; GFX9-NEXT:    s_xor_b64 s[0:1], s[14:15], s[8:9]
9485; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
9486; GFX9-NEXT:    s_add_u32 s8, s10, s4
9487; GFX9-NEXT:    s_mov_b32 s5, s4
9488; GFX9-NEXT:    s_addc_u32 s9, s11, s4
9489; GFX9-NEXT:    v_mov_b32_e32 v2, s16
9490; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[4:5]
9491; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
9492; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s8
9493; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s9
9494; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
9495; GFX9-NEXT:    v_xor_b32_e32 v5, s1, v0
9496; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v1
9497; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
9498; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
9499; GFX9-NEXT:    s_sub_u32 s0, 0, s8
9500; GFX9-NEXT:    v_mov_b32_e32 v6, s1
9501; GFX9-NEXT:    s_subb_u32 s1, 0, s9
9502; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
9503; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
9504; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
9505; GFX9-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
9506; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
9507; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
9508; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v6, vcc
9509; GFX9-NEXT:    v_readfirstlane_b32 s10, v2
9510; GFX9-NEXT:    v_readfirstlane_b32 s13, v3
9511; GFX9-NEXT:    s_mul_hi_u32 s12, s0, s10
9512; GFX9-NEXT:    s_mul_i32 s14, s0, s13
9513; GFX9-NEXT:    s_mul_i32 s11, s1, s10
9514; GFX9-NEXT:    s_add_i32 s12, s12, s14
9515; GFX9-NEXT:    s_add_i32 s12, s12, s11
9516; GFX9-NEXT:    s_mul_i32 s15, s0, s10
9517; GFX9-NEXT:    s_mul_hi_u32 s11, s10, s12
9518; GFX9-NEXT:    s_mul_i32 s14, s10, s12
9519; GFX9-NEXT:    s_mul_hi_u32 s10, s10, s15
9520; GFX9-NEXT:    s_add_u32 s10, s10, s14
9521; GFX9-NEXT:    s_addc_u32 s11, 0, s11
9522; GFX9-NEXT:    s_mul_hi_u32 s16, s13, s15
9523; GFX9-NEXT:    s_mul_i32 s15, s13, s15
9524; GFX9-NEXT:    s_add_u32 s10, s10, s15
9525; GFX9-NEXT:    s_mul_hi_u32 s14, s13, s12
9526; GFX9-NEXT:    s_addc_u32 s10, s11, s16
9527; GFX9-NEXT:    s_addc_u32 s11, s14, 0
9528; GFX9-NEXT:    s_mul_i32 s12, s13, s12
9529; GFX9-NEXT:    s_add_u32 s10, s10, s12
9530; GFX9-NEXT:    s_addc_u32 s11, 0, s11
9531; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s10, v2
9532; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9533; GFX9-NEXT:    s_addc_u32 s10, s13, s11
9534; GFX9-NEXT:    v_readfirstlane_b32 s12, v2
9535; GFX9-NEXT:    s_mul_i32 s11, s0, s10
9536; GFX9-NEXT:    s_mul_hi_u32 s13, s0, s12
9537; GFX9-NEXT:    s_add_i32 s11, s13, s11
9538; GFX9-NEXT:    s_mul_i32 s1, s1, s12
9539; GFX9-NEXT:    s_add_i32 s11, s11, s1
9540; GFX9-NEXT:    s_mul_i32 s0, s0, s12
9541; GFX9-NEXT:    s_mul_hi_u32 s13, s10, s0
9542; GFX9-NEXT:    s_mul_i32 s14, s10, s0
9543; GFX9-NEXT:    s_mul_i32 s16, s12, s11
9544; GFX9-NEXT:    s_mul_hi_u32 s0, s12, s0
9545; GFX9-NEXT:    s_mul_hi_u32 s15, s12, s11
9546; GFX9-NEXT:    s_add_u32 s0, s0, s16
9547; GFX9-NEXT:    s_addc_u32 s12, 0, s15
9548; GFX9-NEXT:    s_add_u32 s0, s0, s14
9549; GFX9-NEXT:    s_mul_hi_u32 s1, s10, s11
9550; GFX9-NEXT:    s_addc_u32 s0, s12, s13
9551; GFX9-NEXT:    s_addc_u32 s1, s1, 0
9552; GFX9-NEXT:    s_mul_i32 s11, s10, s11
9553; GFX9-NEXT:    s_add_u32 s0, s0, s11
9554; GFX9-NEXT:    s_addc_u32 s1, 0, s1
9555; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
9556; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9557; GFX9-NEXT:    s_addc_u32 s12, s10, s1
9558; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
9559; GFX9-NEXT:    s_add_u32 s0, s6, s10
9560; GFX9-NEXT:    s_mov_b32 s11, s10
9561; GFX9-NEXT:    s_addc_u32 s1, s7, s10
9562; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
9563; GFX9-NEXT:    v_readfirstlane_b32 s13, v2
9564; GFX9-NEXT:    s_mul_i32 s1, s6, s12
9565; GFX9-NEXT:    s_mul_hi_u32 s14, s6, s13
9566; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s12
9567; GFX9-NEXT:    s_add_u32 s1, s14, s1
9568; GFX9-NEXT:    s_addc_u32 s0, 0, s0
9569; GFX9-NEXT:    s_mul_hi_u32 s15, s7, s13
9570; GFX9-NEXT:    s_mul_i32 s13, s7, s13
9571; GFX9-NEXT:    s_add_u32 s1, s1, s13
9572; GFX9-NEXT:    s_mul_hi_u32 s14, s7, s12
9573; GFX9-NEXT:    s_addc_u32 s0, s0, s15
9574; GFX9-NEXT:    s_addc_u32 s1, s14, 0
9575; GFX9-NEXT:    s_mul_i32 s12, s7, s12
9576; GFX9-NEXT:    s_add_u32 s12, s0, s12
9577; GFX9-NEXT:    s_addc_u32 s13, 0, s1
9578; GFX9-NEXT:    s_mul_i32 s0, s8, s13
9579; GFX9-NEXT:    s_mul_hi_u32 s1, s8, s12
9580; GFX9-NEXT:    s_add_i32 s0, s1, s0
9581; GFX9-NEXT:    s_mul_i32 s1, s9, s12
9582; GFX9-NEXT:    s_add_i32 s14, s0, s1
9583; GFX9-NEXT:    s_mul_i32 s1, s8, s12
9584; GFX9-NEXT:    v_mov_b32_e32 v2, s1
9585; GFX9-NEXT:    s_sub_i32 s0, s7, s14
9586; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s6, v2
9587; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9588; GFX9-NEXT:    s_subb_u32 s6, s0, s9
9589; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s8, v2
9590; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
9591; GFX9-NEXT:    s_subb_u32 s6, s6, 0
9592; GFX9-NEXT:    s_cmp_ge_u32 s6, s9
9593; GFX9-NEXT:    s_cselect_b32 s15, -1, 0
9594; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
9595; GFX9-NEXT:    s_cmp_eq_u32 s6, s9
9596; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
9597; GFX9-NEXT:    v_mov_b32_e32 v5, s15
9598; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
9599; GFX9-NEXT:    s_add_u32 s6, s12, 2
9600; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[0:1]
9601; GFX9-NEXT:    s_addc_u32 s0, s13, 0
9602; GFX9-NEXT:    s_add_u32 s15, s12, 1
9603; GFX9-NEXT:    s_addc_u32 s1, s13, 0
9604; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9605; GFX9-NEXT:    s_subb_u32 s7, s7, s14
9606; GFX9-NEXT:    s_cmp_ge_u32 s7, s9
9607; GFX9-NEXT:    v_mov_b32_e32 v5, s1
9608; GFX9-NEXT:    v_mov_b32_e32 v6, s0
9609; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
9610; GFX9-NEXT:    s_cselect_b32 s14, -1, 0
9611; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
9612; GFX9-NEXT:    s_cmp_eq_u32 s7, s9
9613; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v6, s[0:1]
9614; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
9615; GFX9-NEXT:    v_mov_b32_e32 v5, s14
9616; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
9617; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
9618; GFX9-NEXT:    v_mov_b32_e32 v5, s13
9619; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
9620; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v3, vcc
9621; GFX9-NEXT:    v_mov_b32_e32 v3, s15
9622; GFX9-NEXT:    v_mov_b32_e32 v5, s6
9623; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
9624; GFX9-NEXT:    v_mov_b32_e32 v5, s12
9625; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
9626; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
9627; GFX9-NEXT:    v_xor_b32_e32 v3, s0, v3
9628; GFX9-NEXT:    v_xor_b32_e32 v5, s1, v2
9629; GFX9-NEXT:    v_mov_b32_e32 v6, s1
9630; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v3
9631; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
9632; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9633; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
9634; GFX9-NEXT:    s_endpgm
9635  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
9636  %r = sdiv <2 x i64> %x, %shl.y
9637  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
9638  ret void
9639}
9640
; Signed 64-bit remainder of %x by the odd constant 1235195 (0x12d8fb); the
; result is stored to %out.
; The assertions in this function are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE at the top of the file);
; regenerate them with that script instead of editing them by hand.
; - opt run with -amdgpu-codegenprepare: the srem is expected to stay as-is.
; - llc runs (tahiti and gfx900): the srem is expected to be expanded inline.
;   The sequence starts from a v_rcp_f32 estimate of 1/1235195.0 (0x4996c7d8
;   is 1235195 encoded as an f32) and multiplies by 0xffed2705, which is
;   -1235195 as a 32-bit value. The sign of %x is split off up front with
;   s_ashr_i32 ..., 31 and re-applied by the final xor/subtract pair.
9641define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
9642; CHECK-LABEL: @srem_i64_oddk_denom(
9643; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 1235195
9644; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
9645; CHECK-NEXT:    ret void
9646;
9647; GFX6-LABEL: srem_i64_oddk_denom:
9648; GFX6:       ; %bb.0:
9649; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
9650; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
9651; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9652; GFX6-NEXT:    s_mov_b32 s4, 0xffed2705
9653; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
9654; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9655; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9656; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9657; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9658; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9659; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9660; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9661; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9662; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
9663; GFX6-NEXT:    s_add_u32 s2, s2, s8
9664; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s4
9665; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s4
9666; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s4
9667; GFX6-NEXT:    s_mov_b32 s9, s8
9668; GFX6-NEXT:    s_addc_u32 s3, s3, s8
9669; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9670; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
9671; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
9672; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
9673; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
9674; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
9675; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9676; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
9677; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
9678; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
9679; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
9680; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[8:9]
9681; GFX6-NEXT:    s_mov_b32 s5, s1
9682; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
9683; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
9684; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
9685; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9686; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9687; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9688; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
9689; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s4
9690; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s4
9691; GFX6-NEXT:    s_mov_b32 s6, -1
9692; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9693; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s4
9694; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
9695; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
9696; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
9697; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
9698; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
9699; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
9700; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
9701; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
9702; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
9703; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9704; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
9705; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
9706; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
9707; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9708; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9709; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9710; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
9711; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
9712; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
9713; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
9714; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
9715; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
9716; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9717; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9718; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
9719; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
9720; GFX6-NEXT:    s_mov_b32 s4, s0
9721; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fb
9722; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9723; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
9724; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
9725; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
9726; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
9727; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
9728; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s0
9729; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s0
9730; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
9731; GFX6-NEXT:    v_mov_b32_e32 v2, s3
9732; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
9733; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
9734; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v0
9735; GFX6-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
9736; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s0, v2
9737; GFX6-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
9738; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fa
9739; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
9740; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
9741; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
9742; GFX6-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
9743; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
9744; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
9745; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
9746; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
9747; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
9748; GFX6-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[0:1]
9749; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
9750; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
9751; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
9752; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
9753; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
9754; GFX6-NEXT:    v_xor_b32_e32 v1, s8, v1
9755; GFX6-NEXT:    v_mov_b32_e32 v2, s8
9756; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
9757; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
9758; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
9759; GFX6-NEXT:    s_endpgm
9760;
9761; GFX9-LABEL: srem_i64_oddk_denom:
9762; GFX9:       ; %bb.0:
9763; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4996c7d8
9764; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
9765; GFX9-NEXT:    v_mac_f32_e32 v0, 0, v1
9766; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
9767; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
9768; GFX9-NEXT:    v_mov_b32_e32 v2, 0
9769; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9770; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9771; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
9772; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9773; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
9774; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
9775; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
9776; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
9777; GFX9-NEXT:    s_mul_hi_u32 s2, s1, 0xffed2705
9778; GFX9-NEXT:    s_mul_i32 s3, s0, 0xffed2705
9779; GFX9-NEXT:    s_add_i32 s2, s2, s3
9780; GFX9-NEXT:    s_sub_i32 s2, s2, s1
9781; GFX9-NEXT:    s_mul_i32 s9, s1, 0xffed2705
9782; GFX9-NEXT:    s_mul_hi_u32 s3, s1, s2
9783; GFX9-NEXT:    s_mul_i32 s8, s1, s2
9784; GFX9-NEXT:    s_mul_hi_u32 s1, s1, s9
9785; GFX9-NEXT:    s_add_u32 s1, s1, s8
9786; GFX9-NEXT:    s_addc_u32 s3, 0, s3
9787; GFX9-NEXT:    s_mul_hi_u32 s10, s0, s9
9788; GFX9-NEXT:    s_mul_i32 s9, s0, s9
9789; GFX9-NEXT:    s_add_u32 s1, s1, s9
9790; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
9791; GFX9-NEXT:    s_addc_u32 s1, s3, s10
9792; GFX9-NEXT:    s_addc_u32 s3, s8, 0
9793; GFX9-NEXT:    s_mul_i32 s2, s0, s2
9794; GFX9-NEXT:    s_add_u32 s1, s1, s2
9795; GFX9-NEXT:    s_addc_u32 s2, 0, s3
9796; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v0
9797; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9798; GFX9-NEXT:    s_addc_u32 s0, s0, s2
9799; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
9800; GFX9-NEXT:    s_mul_i32 s1, s0, 0xffed2705
9801; GFX9-NEXT:    s_mul_hi_u32 s3, s2, 0xffed2705
9802; GFX9-NEXT:    s_add_i32 s3, s3, s1
9803; GFX9-NEXT:    s_sub_i32 s1, s3, s2
9804; GFX9-NEXT:    s_mul_i32 s8, s2, 0xffed2705
9805; GFX9-NEXT:    s_mul_hi_u32 s11, s2, s1
9806; GFX9-NEXT:    s_mul_i32 s12, s2, s1
9807; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s8
9808; GFX9-NEXT:    s_add_u32 s2, s2, s12
9809; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s8
9810; GFX9-NEXT:    s_mul_i32 s10, s0, s8
9811; GFX9-NEXT:    s_addc_u32 s8, 0, s11
9812; GFX9-NEXT:    s_add_u32 s2, s2, s10
9813; GFX9-NEXT:    s_mul_hi_u32 s3, s0, s1
9814; GFX9-NEXT:    s_addc_u32 s2, s8, s9
9815; GFX9-NEXT:    s_addc_u32 s3, s3, 0
9816; GFX9-NEXT:    s_mul_i32 s1, s0, s1
9817; GFX9-NEXT:    s_add_u32 s1, s2, s1
9818; GFX9-NEXT:    s_addc_u32 s2, 0, s3
9819; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v0
9820; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9821; GFX9-NEXT:    s_addc_u32 s8, s0, s2
9822; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9823; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
9824; GFX9-NEXT:    s_add_u32 s0, s6, s2
9825; GFX9-NEXT:    s_mov_b32 s3, s2
9826; GFX9-NEXT:    s_addc_u32 s1, s7, s2
9827; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
9828; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
9829; GFX9-NEXT:    s_mul_i32 s6, s0, s8
9830; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s7
9831; GFX9-NEXT:    s_mul_hi_u32 s3, s0, s8
9832; GFX9-NEXT:    s_add_u32 s6, s9, s6
9833; GFX9-NEXT:    s_addc_u32 s3, 0, s3
9834; GFX9-NEXT:    s_mul_hi_u32 s10, s1, s7
9835; GFX9-NEXT:    s_mul_i32 s7, s1, s7
9836; GFX9-NEXT:    s_add_u32 s6, s6, s7
9837; GFX9-NEXT:    s_mul_hi_u32 s9, s1, s8
9838; GFX9-NEXT:    s_addc_u32 s3, s3, s10
9839; GFX9-NEXT:    s_addc_u32 s6, s9, 0
9840; GFX9-NEXT:    s_mul_i32 s7, s1, s8
9841; GFX9-NEXT:    s_add_u32 s3, s3, s7
9842; GFX9-NEXT:    s_addc_u32 s6, 0, s6
9843; GFX9-NEXT:    s_mul_hi_u32 s8, s3, 0x12d8fb
9844; GFX9-NEXT:    s_mul_i32 s3, s3, 0x12d8fb
9845; GFX9-NEXT:    s_mul_i32 s6, s6, 0x12d8fb
9846; GFX9-NEXT:    v_mov_b32_e32 v0, s3
9847; GFX9-NEXT:    s_add_i32 s8, s8, s6
9848; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
9849; GFX9-NEXT:    s_mov_b32 s7, 0x12d8fb
9850; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9851; GFX9-NEXT:    s_subb_u32 s3, s1, s8
9852; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s7, v0
9853; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9854; GFX9-NEXT:    s_subb_u32 s0, s3, 0
9855; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s7, v1
9856; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9857; GFX9-NEXT:    s_subb_u32 s1, s0, 0
9858; GFX9-NEXT:    s_mov_b32 s6, 0x12d8fa
9859; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
9860; GFX9-NEXT:    s_cmp_eq_u32 s0, 0
9861; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
9862; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
9863; GFX9-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
9864; GFX9-NEXT:    v_mov_b32_e32 v5, s0
9865; GFX9-NEXT:    v_mov_b32_e32 v6, s1
9866; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
9867; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s6, v0
9868; GFX9-NEXT:    s_cmp_eq_u32 s3, 0
9869; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
9870; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
9871; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
9872; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[0:1]
9873; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
9874; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
9875; GFX9-NEXT:    v_mov_b32_e32 v6, s3
9876; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
9877; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
9878; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
9879; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v4
9880; GFX9-NEXT:    v_mov_b32_e32 v3, s2
9881; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s2, v0
9882; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
9883; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
9884; GFX9-NEXT:    s_endpgm
9885  %r = srem i64 %x, 1235195
9886  store i64 %r, i64 addrspace(1)* %out
9887  ret void
9888}
9889
; Signed 64-bit remainder of %x by the power-of-two constant 4096; the result
; is stored to %out.
; The assertions in this function are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE at the top of the file);
; regenerate them with that script instead of editing them by hand.
; - opt run with -amdgpu-codegenprepare: the srem is expected to stay as-is.
; - llc runs (tahiti and gfx900): no reciprocal/multiply expansion is expected.
;   The sequence computes x - ((x + bias) & ~4095), where bias is the sign word
;   of x (s_ashr_i32 by 31) shifted right logically by 20, i.e. 4095 when x is
;   negative and 0 otherwise. Only the low dword needs the 0xfffff000 mask.
9890define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
9891; CHECK-LABEL: @srem_i64_pow2k_denom(
9892; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 4096
9893; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
9894; CHECK-NEXT:    ret void
9895;
9896; GFX6-LABEL: srem_i64_pow2k_denom:
9897; GFX6:       ; %bb.0:
9898; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
9899; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9900; GFX6-NEXT:    s_mov_b32 s6, -1
9901; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9902; GFX6-NEXT:    s_mov_b32 s4, s0
9903; GFX6-NEXT:    s_ashr_i32 s0, s3, 31
9904; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
9905; GFX6-NEXT:    s_add_u32 s0, s2, s0
9906; GFX6-NEXT:    s_mov_b32 s5, s1
9907; GFX6-NEXT:    s_addc_u32 s1, s3, 0
9908; GFX6-NEXT:    s_and_b32 s0, s0, 0xfffff000
9909; GFX6-NEXT:    s_sub_u32 s0, s2, s0
9910; GFX6-NEXT:    s_subb_u32 s1, s3, s1
9911; GFX6-NEXT:    v_mov_b32_e32 v0, s0
9912; GFX6-NEXT:    v_mov_b32_e32 v1, s1
9913; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
9914; GFX6-NEXT:    s_endpgm
9915;
9916; GFX9-LABEL: srem_i64_pow2k_denom:
9917; GFX9:       ; %bb.0:
9918; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
9919; GFX9-NEXT:    v_mov_b32_e32 v2, 0
9920; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9921; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
9922; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
9923; GFX9-NEXT:    s_add_u32 s4, s2, s4
9924; GFX9-NEXT:    s_addc_u32 s5, s3, 0
9925; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
9926; GFX9-NEXT:    s_sub_u32 s2, s2, s4
9927; GFX9-NEXT:    s_subb_u32 s3, s3, s5
9928; GFX9-NEXT:    v_mov_b32_e32 v0, s2
9929; GFX9-NEXT:    v_mov_b32_e32 v1, s3
9930; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
9931; GFX9-NEXT:    s_endpgm
9932  %r = srem i64 %x, 4096
9933  store i64 %r, i64 addrspace(1)* %out
9934  ret void
9935}
9936
9937define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
9938; CHECK-LABEL: @srem_i64_pow2_shl_denom(
9939; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
9940; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]]
9941; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
9942; CHECK-NEXT:    ret void
9943;
9944; GFX6-LABEL: srem_i64_pow2_shl_denom:
9945; GFX6:       ; %bb.0:
9946; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xd
9947; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
9948; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9949; GFX6-NEXT:    s_mov_b32 s6, -1
9950; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9951; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
9952; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
9953; GFX6-NEXT:    s_add_u32 s2, s2, s4
9954; GFX6-NEXT:    s_mov_b32 s5, s4
9955; GFX6-NEXT:    s_addc_u32 s3, s3, s4
9956; GFX6-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
9957; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
9958; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
9959; GFX6-NEXT:    s_sub_u32 s4, 0, s8
9960; GFX6-NEXT:    s_subb_u32 s5, 0, s9
9961; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
9962; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
9963; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9964; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9965; GFX6-NEXT:    s_ashr_i32 s10, s3, 31
9966; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9967; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9968; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9969; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9970; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9971; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9972; GFX6-NEXT:    s_add_u32 s2, s2, s10
9973; GFX6-NEXT:    s_mov_b32 s11, s10
9974; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
9975; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
9976; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
9977; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
9978; GFX6-NEXT:    s_addc_u32 s3, s3, s10
9979; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9980; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
9981; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
9982; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
9983; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
9984; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
9985; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9986; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
9987; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
9988; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
9989; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
9990; GFX6-NEXT:    s_xor_b64 s[12:13], s[2:3], s[10:11]
9991; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
9992; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
9993; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
9994; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9995; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9996; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9997; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
9998; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
9999; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
10000; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
10001; GFX6-NEXT:    s_mov_b32 s5, s1
10002; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10003; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
10004; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
10005; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
10006; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
10007; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
10008; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
10009; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
10010; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
10011; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
10012; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
10013; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
10014; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
10015; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
10016; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
10017; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10018; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
10019; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
10020; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
10021; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v1
10022; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v0
10023; GFX6-NEXT:    v_mul_hi_u32 v4, s12, v1
10024; GFX6-NEXT:    v_mul_hi_u32 v5, s13, v1
10025; GFX6-NEXT:    v_mul_lo_u32 v1, s13, v1
10026; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10027; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
10028; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v0
10029; GFX6-NEXT:    v_mul_hi_u32 v0, s13, v0
10030; GFX6-NEXT:    s_mov_b32 s4, s0
10031; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
10032; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
10033; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
10034; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
10035; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
10036; GFX6-NEXT:    v_mul_lo_u32 v1, s8, v1
10037; GFX6-NEXT:    v_mul_hi_u32 v2, s8, v0
10038; GFX6-NEXT:    v_mul_lo_u32 v3, s9, v0
10039; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
10040; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
10041; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
10042; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s13, v1
10043; GFX6-NEXT:    v_mov_b32_e32 v3, s9
10044; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s12, v0
10045; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
10046; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s8, v0
10047; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
10048; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v5
10049; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
10050; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v4
10051; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
10052; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
10053; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v5
10054; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s8, v4
10055; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
10056; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
10057; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
10058; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
10059; GFX6-NEXT:    v_mov_b32_e32 v5, s13
10060; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
10061; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
10062; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
10063; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
10064; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
10065; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
10066; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
10067; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
10068; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
10069; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
10070; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
10071; GFX6-NEXT:    v_xor_b32_e32 v0, s10, v0
10072; GFX6-NEXT:    v_xor_b32_e32 v1, s10, v1
10073; GFX6-NEXT:    v_mov_b32_e32 v2, s10
10074; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s10, v0
10075; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
10076; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
10077; GFX6-NEXT:    s_endpgm
10078;
10079; GFX9-LABEL: srem_i64_pow2_shl_denom:
10080; GFX9:       ; %bb.0:
10081; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
10082; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
10083; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10084; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
10085; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
10086; GFX9-NEXT:    s_add_u32 s2, s2, s4
10087; GFX9-NEXT:    s_mov_b32 s5, s4
10088; GFX9-NEXT:    s_addc_u32 s3, s3, s4
10089; GFX9-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
10090; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
10091; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
10092; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
10093; GFX9-NEXT:    s_sub_u32 s0, 0, s8
10094; GFX9-NEXT:    s_subb_u32 s1, 0, s9
10095; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
10096; GFX9-NEXT:    v_rcp_f32_e32 v1, v0
10097; GFX9-NEXT:    v_mov_b32_e32 v0, 0
10098; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
10099; GFX9-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
10100; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
10101; GFX9-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
10102; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
10103; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
10104; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
10105; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
10106; GFX9-NEXT:    s_mul_i32 s10, s0, s2
10107; GFX9-NEXT:    s_mul_hi_u32 s12, s0, s3
10108; GFX9-NEXT:    s_mul_i32 s11, s1, s3
10109; GFX9-NEXT:    s_add_i32 s10, s12, s10
10110; GFX9-NEXT:    s_add_i32 s10, s10, s11
10111; GFX9-NEXT:    s_mul_i32 s13, s0, s3
10112; GFX9-NEXT:    s_mul_hi_u32 s11, s3, s10
10113; GFX9-NEXT:    s_mul_i32 s12, s3, s10
10114; GFX9-NEXT:    s_mul_hi_u32 s3, s3, s13
10115; GFX9-NEXT:    s_add_u32 s3, s3, s12
10116; GFX9-NEXT:    s_addc_u32 s11, 0, s11
10117; GFX9-NEXT:    s_mul_hi_u32 s14, s2, s13
10118; GFX9-NEXT:    s_mul_i32 s13, s2, s13
10119; GFX9-NEXT:    s_add_u32 s3, s3, s13
10120; GFX9-NEXT:    s_mul_hi_u32 s12, s2, s10
10121; GFX9-NEXT:    s_addc_u32 s3, s11, s14
10122; GFX9-NEXT:    s_addc_u32 s11, s12, 0
10123; GFX9-NEXT:    s_mul_i32 s10, s2, s10
10124; GFX9-NEXT:    s_add_u32 s3, s3, s10
10125; GFX9-NEXT:    s_addc_u32 s10, 0, s11
10126; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s3, v1
10127; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
10128; GFX9-NEXT:    s_addc_u32 s2, s2, s10
10129; GFX9-NEXT:    v_readfirstlane_b32 s10, v1
10130; GFX9-NEXT:    s_mul_i32 s3, s0, s2
10131; GFX9-NEXT:    s_mul_hi_u32 s11, s0, s10
10132; GFX9-NEXT:    s_add_i32 s3, s11, s3
10133; GFX9-NEXT:    s_mul_i32 s1, s1, s10
10134; GFX9-NEXT:    s_add_i32 s3, s3, s1
10135; GFX9-NEXT:    s_mul_i32 s0, s0, s10
10136; GFX9-NEXT:    s_mul_hi_u32 s11, s2, s0
10137; GFX9-NEXT:    s_mul_i32 s12, s2, s0
10138; GFX9-NEXT:    s_mul_i32 s14, s10, s3
10139; GFX9-NEXT:    s_mul_hi_u32 s0, s10, s0
10140; GFX9-NEXT:    s_mul_hi_u32 s13, s10, s3
10141; GFX9-NEXT:    s_add_u32 s0, s0, s14
10142; GFX9-NEXT:    s_addc_u32 s10, 0, s13
10143; GFX9-NEXT:    s_add_u32 s0, s0, s12
10144; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
10145; GFX9-NEXT:    s_addc_u32 s0, s10, s11
10146; GFX9-NEXT:    s_addc_u32 s1, s1, 0
10147; GFX9-NEXT:    s_mul_i32 s3, s2, s3
10148; GFX9-NEXT:    s_add_u32 s0, s0, s3
10149; GFX9-NEXT:    s_addc_u32 s1, 0, s1
10150; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
10151; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
10152; GFX9-NEXT:    s_addc_u32 s2, s2, s1
10153; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10154; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
10155; GFX9-NEXT:    s_add_u32 s0, s6, s10
10156; GFX9-NEXT:    s_mov_b32 s11, s10
10157; GFX9-NEXT:    s_addc_u32 s1, s7, s10
10158; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
10159; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
10160; GFX9-NEXT:    s_mul_i32 s1, s6, s2
10161; GFX9-NEXT:    s_mul_hi_u32 s11, s6, s3
10162; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s2
10163; GFX9-NEXT:    s_add_u32 s1, s11, s1
10164; GFX9-NEXT:    s_addc_u32 s0, 0, s0
10165; GFX9-NEXT:    s_mul_hi_u32 s12, s7, s3
10166; GFX9-NEXT:    s_mul_i32 s3, s7, s3
10167; GFX9-NEXT:    s_add_u32 s1, s1, s3
10168; GFX9-NEXT:    s_mul_hi_u32 s11, s7, s2
10169; GFX9-NEXT:    s_addc_u32 s0, s0, s12
10170; GFX9-NEXT:    s_addc_u32 s1, s11, 0
10171; GFX9-NEXT:    s_mul_i32 s2, s7, s2
10172; GFX9-NEXT:    s_add_u32 s0, s0, s2
10173; GFX9-NEXT:    s_addc_u32 s1, 0, s1
10174; GFX9-NEXT:    s_mul_i32 s1, s8, s1
10175; GFX9-NEXT:    s_mul_hi_u32 s2, s8, s0
10176; GFX9-NEXT:    s_add_i32 s1, s2, s1
10177; GFX9-NEXT:    s_mul_i32 s2, s9, s0
10178; GFX9-NEXT:    s_mul_i32 s0, s8, s0
10179; GFX9-NEXT:    s_add_i32 s11, s1, s2
10180; GFX9-NEXT:    v_mov_b32_e32 v1, s0
10181; GFX9-NEXT:    s_sub_i32 s1, s7, s11
10182; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, s6, v1
10183; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
10184; GFX9-NEXT:    s_subb_u32 s6, s1, s9
10185; GFX9-NEXT:    v_subrev_co_u32_e64 v2, s[0:1], s8, v1
10186; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
10187; GFX9-NEXT:    s_subb_u32 s12, s6, 0
10188; GFX9-NEXT:    s_cmp_ge_u32 s12, s9
10189; GFX9-NEXT:    s_cselect_b32 s13, -1, 0
10190; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v2
10191; GFX9-NEXT:    s_cmp_eq_u32 s12, s9
10192; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[2:3]
10193; GFX9-NEXT:    v_mov_b32_e32 v4, s13
10194; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
10195; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
10196; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[2:3]
10197; GFX9-NEXT:    s_subb_u32 s2, s6, s9
10198; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v2
10199; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
10200; GFX9-NEXT:    s_subb_u32 s0, s2, 0
10201; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
10202; GFX9-NEXT:    s_subb_u32 s2, s7, s11
10203; GFX9-NEXT:    s_cmp_ge_u32 s2, s9
10204; GFX9-NEXT:    v_mov_b32_e32 v5, s12
10205; GFX9-NEXT:    v_mov_b32_e32 v6, s0
10206; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
10207; GFX9-NEXT:    s_cselect_b32 s3, -1, 0
10208; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
10209; GFX9-NEXT:    s_cmp_eq_u32 s2, s9
10210; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v6, s[0:1]
10211; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
10212; GFX9-NEXT:    v_mov_b32_e32 v6, s3
10213; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
10214; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
10215; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
10216; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
10217; GFX9-NEXT:    v_mov_b32_e32 v6, s2
10218; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
10219; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
10220; GFX9-NEXT:    v_xor_b32_e32 v1, s10, v1
10221; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v3
10222; GFX9-NEXT:    v_mov_b32_e32 v3, s10
10223; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s10, v1
10224; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
10225; GFX9-NEXT:    global_store_dwordx2 v0, v[1:2], s[4:5]
10226; GFX9-NEXT:    s_endpgm
10227  %shl.y = shl i64 4096, %y
10228  %r = srem i64 %x, %shl.y
10229  store i64 %r, i64 addrspace(1)* %out
10230  ret void
10231}
10232
; Signed remainder of a <2 x i64> vector by the splat constant 4096 (a
; power-of-two denominator known at compile time).
;   * IR checks: the vector srem is split into one `srem i64 ..., 4096` per
;     element (extractelement / srem / insertelement) and the result is stored.
;   * GFX6 and GFX9 checks: each element is lowered to a short scalar
;     ashr / lshr / add / addc / and 0xfffff000 / sub / subb sequence; no
;     reciprocal or multiply instructions appear in the expected output.
10233define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
10234; CHECK-LABEL: @srem_v2i64_pow2k_denom(
10235; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
10236; CHECK-NEXT:    [[TMP2:%.*]] = srem i64 [[TMP1]], 4096
10237; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
10238; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
10239; CHECK-NEXT:    [[TMP5:%.*]] = srem i64 [[TMP4]], 4096
10240; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
10241; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
10242; CHECK-NEXT:    ret void
10243;
10244; GFX6-LABEL: srem_v2i64_pow2k_denom:
10245; GFX6:       ; %bb.0:
10246; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
10247; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
10248; GFX6-NEXT:    s_mov_b32 s3, 0xf000
10249; GFX6-NEXT:    s_mov_b32 s2, -1
10250; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10251; GFX6-NEXT:    s_ashr_i32 s8, s5, 31
10252; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
10253; GFX6-NEXT:    s_add_u32 s8, s4, s8
10254; GFX6-NEXT:    s_addc_u32 s9, s5, 0
10255; GFX6-NEXT:    s_and_b32 s8, s8, 0xfffff000
10256; GFX6-NEXT:    s_sub_u32 s4, s4, s8
10257; GFX6-NEXT:    s_subb_u32 s5, s5, s9
10258; GFX6-NEXT:    s_ashr_i32 s8, s7, 31
10259; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
10260; GFX6-NEXT:    s_add_u32 s8, s6, s8
10261; GFX6-NEXT:    s_addc_u32 s9, s7, 0
10262; GFX6-NEXT:    s_and_b32 s8, s8, 0xfffff000
10263; GFX6-NEXT:    s_sub_u32 s6, s6, s8
10264; GFX6-NEXT:    s_subb_u32 s7, s7, s9
10265; GFX6-NEXT:    v_mov_b32_e32 v0, s4
10266; GFX6-NEXT:    v_mov_b32_e32 v1, s5
10267; GFX6-NEXT:    v_mov_b32_e32 v2, s6
10268; GFX6-NEXT:    v_mov_b32_e32 v3, s7
10269; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
10270; GFX6-NEXT:    s_endpgm
10271;
10272; GFX9-LABEL: srem_v2i64_pow2k_denom:
10273; GFX9:       ; %bb.0:
10274; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
10275; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
10276; GFX9-NEXT:    v_mov_b32_e32 v4, 0
10277; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10278; GFX9-NEXT:    s_ashr_i32 s0, s5, 31
10279; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
10280; GFX9-NEXT:    s_add_u32 s0, s4, s0
10281; GFX9-NEXT:    s_addc_u32 s1, s5, 0
10282; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
10283; GFX9-NEXT:    s_sub_u32 s0, s4, s0
10284; GFX9-NEXT:    s_subb_u32 s1, s5, s1
10285; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
10286; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
10287; GFX9-NEXT:    s_add_u32 s4, s6, s4
10288; GFX9-NEXT:    s_addc_u32 s5, s7, 0
10289; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
10290; GFX9-NEXT:    s_sub_u32 s4, s6, s4
10291; GFX9-NEXT:    s_subb_u32 s5, s7, s5
10292; GFX9-NEXT:    v_mov_b32_e32 v0, s0
10293; GFX9-NEXT:    v_mov_b32_e32 v1, s1
10294; GFX9-NEXT:    v_mov_b32_e32 v2, s4
10295; GFX9-NEXT:    v_mov_b32_e32 v3, s5
10296; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
10297; GFX9-NEXT:    s_endpgm
; Test input: both lanes use the same constant denominator 4096.
10298  %r = srem <2 x i64> %x, <i64 4096, i64 4096>
10299  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
10300  ret void
10301}
10302
; Signed remainder of a <2 x i64> vector where the denominator is the splat
; constant 4096 shifted left by a runtime per-lane amount %y, so the
; denominator value is not a compile-time constant.
;   * IR checks: the shl stays a vector op; the srem is split into one
;     `srem i64` per element with a variable denominator and is left as srem
;     in the IR.
;   * GFX6 and GFX9 checks: each element is expanded into a long sequence that
;     takes absolute values (ashr 31 / add / xor), uses v_rcp_f32 on the
;     converted denominator, then integer multiply and add/sub-with-carry
;     chains, compare/select steps, and a final xor/sub with the sign mask.
;     The 64-bit store of both lanes happens once at the end.
10303define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
10304; CHECK-LABEL: @srem_v2i64_pow2_shl_denom(
10305; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
10306; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
10307; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
10308; CHECK-NEXT:    [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]]
10309; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
10310; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
10311; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
10312; CHECK-NEXT:    [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]]
10313; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
10314; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
10315; CHECK-NEXT:    ret void
10316;
10317; GFX6-LABEL: srem_v2i64_pow2_shl_denom:
10318; GFX6:       ; %bb.0:
10319; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
10320; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
10321; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10322; GFX6-NEXT:    s_mov_b32 s11, 0xf000
10323; GFX6-NEXT:    s_lshl_b64 s[14:15], s[2:3], s10
10324; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
10325; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
10326; GFX6-NEXT:    s_add_u32 s2, s2, s8
10327; GFX6-NEXT:    s_mov_b32 s9, s8
10328; GFX6-NEXT:    s_addc_u32 s3, s3, s8
10329; GFX6-NEXT:    s_xor_b64 s[16:17], s[2:3], s[8:9]
10330; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s16
10331; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s17
10332; GFX6-NEXT:    s_sub_u32 s2, 0, s16
10333; GFX6-NEXT:    s_subb_u32 s3, 0, s17
10334; GFX6-NEXT:    s_ashr_i32 s12, s5, 31
10335; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
10336; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
10337; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
10338; GFX6-NEXT:    s_add_u32 s0, s4, s12
10339; GFX6-NEXT:    s_mov_b32 s13, s12
10340; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
10341; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
10342; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
10343; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
10344; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
10345; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
10346; GFX6-NEXT:    s_addc_u32 s1, s5, s12
10347; GFX6-NEXT:    s_xor_b64 s[4:5], s[0:1], s[12:13]
10348; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
10349; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
10350; GFX6-NEXT:    v_mul_lo_u32 v5, s3, v0
10351; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v0
10352; GFX6-NEXT:    s_mov_b32 s10, -1
10353; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
10354; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
10355; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
10356; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
10357; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
10358; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
10359; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
10360; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
10361; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
10362; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
10363; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
10364; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
10365; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
10366; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
10367; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10368; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
10369; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
10370; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
10371; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
10372; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
10373; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
10374; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10375; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v0
10376; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
10377; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
10378; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
10379; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
10380; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
10381; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
10382; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
10383; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
10384; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
10385; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
10386; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
10387; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
10388; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
10389; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10390; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
10391; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
10392; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
10393; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
10394; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
10395; GFX6-NEXT:    v_mul_hi_u32 v4, s4, v1
10396; GFX6-NEXT:    v_mul_hi_u32 v5, s5, v1
10397; GFX6-NEXT:    v_mul_lo_u32 v1, s5, v1
10398; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10399; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
10400; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
10401; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
10402; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
10403; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
10404; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
10405; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
10406; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
10407; GFX6-NEXT:    v_mul_lo_u32 v1, s16, v1
10408; GFX6-NEXT:    v_mul_hi_u32 v2, s16, v0
10409; GFX6-NEXT:    v_mul_lo_u32 v3, s17, v0
10410; GFX6-NEXT:    v_mul_lo_u32 v0, s16, v0
10411; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
10412; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
10413; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v1
10414; GFX6-NEXT:    v_mov_b32_e32 v3, s17
10415; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
10416; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
10417; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s16, v0
10418; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
10419; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s17, v5
10420; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
10421; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v4
10422; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
10423; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
10424; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s17, v5
10425; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s16, v4
10426; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
10427; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
10428; GFX6-NEXT:    s_ashr_i32 s2, s15, 31
10429; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
10430; GFX6-NEXT:    s_add_u32 s4, s14, s2
10431; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
10432; GFX6-NEXT:    v_mov_b32_e32 v5, s5
10433; GFX6-NEXT:    s_mov_b32 s3, s2
10434; GFX6-NEXT:    s_addc_u32 s5, s15, s2
10435; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[2:3]
10436; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
10437; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s5
10438; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
10439; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
10440; GFX6-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
10441; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
10442; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
10443; GFX6-NEXT:    v_rcp_f32_e32 v6, v6
10444; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
10445; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v1
10446; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
10447; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
10448; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
10449; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
10450; GFX6-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v6
10451; GFX6-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
10452; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
10453; GFX6-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
10454; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
10455; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
10456; GFX6-NEXT:    s_sub_u32 s0, 0, s4
10457; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
10458; GFX6-NEXT:    v_mul_hi_u32 v2, s0, v3
10459; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
10460; GFX6-NEXT:    s_subb_u32 s1, 0, s5
10461; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v3
10462; GFX6-NEXT:    s_ashr_i32 s14, s7, 31
10463; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
10464; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
10465; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
10466; GFX6-NEXT:    v_mul_lo_u32 v6, v3, v2
10467; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
10468; GFX6-NEXT:    v_mul_hi_u32 v8, v3, v2
10469; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v2
10470; GFX6-NEXT:    v_mul_lo_u32 v2, v4, v2
10471; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
10472; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
10473; GFX6-NEXT:    v_mul_lo_u32 v8, v4, v5
10474; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
10475; GFX6-NEXT:    s_mov_b32 s15, s14
10476; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
10477; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
10478; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
10479; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
10480; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
10481; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
10482; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10483; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
10484; GFX6-NEXT:    v_mul_lo_u32 v4, s0, v3
10485; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v2
10486; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v2
10487; GFX6-NEXT:    v_xor_b32_e32 v1, s12, v1
10488; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
10489; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v2
10490; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
10491; GFX6-NEXT:    v_mul_lo_u32 v8, v2, v4
10492; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
10493; GFX6-NEXT:    v_mul_hi_u32 v10, v2, v4
10494; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
10495; GFX6-NEXT:    v_mul_lo_u32 v5, v3, v5
10496; GFX6-NEXT:    v_mul_hi_u32 v6, v3, v4
10497; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
10498; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
10499; GFX6-NEXT:    v_mul_lo_u32 v4, v3, v4
10500; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
10501; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
10502; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
10503; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
10504; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
10505; GFX6-NEXT:    s_add_u32 s0, s6, s14
10506; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
10507; GFX6-NEXT:    s_addc_u32 s1, s7, s14
10508; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
10509; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[14:15]
10510; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v3
10511; GFX6-NEXT:    v_mul_hi_u32 v5, s6, v2
10512; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v3
10513; GFX6-NEXT:    v_mul_hi_u32 v8, s7, v3
10514; GFX6-NEXT:    v_mul_lo_u32 v3, s7, v3
10515; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
10516; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
10517; GFX6-NEXT:    v_mul_lo_u32 v7, s7, v2
10518; GFX6-NEXT:    v_mul_hi_u32 v2, s7, v2
10519; GFX6-NEXT:    v_mov_b32_e32 v6, s12
10520; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
10521; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
10522; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
10523; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
10524; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
10525; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v3
10526; GFX6-NEXT:    v_mul_hi_u32 v4, s4, v2
10527; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v2
10528; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
10529; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v2
10530; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
10531; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
10532; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
10533; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s7, v3
10534; GFX6-NEXT:    v_mov_b32_e32 v5, s5
10535; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
10536; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
10537; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s4, v2
10538; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
10539; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s5, v7
10540; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
10541; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s4, v6
10542; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
10543; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
10544; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s5, v7
10545; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s4, v6
10546; GFX6-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
10547; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
10548; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
10549; GFX6-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
10550; GFX6-NEXT:    v_mov_b32_e32 v7, s7
10551; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
10552; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
10553; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
10554; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
10555; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
10556; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s5, v3
10557; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
10558; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
10559; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
10560; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
10561; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
10562; GFX6-NEXT:    v_xor_b32_e32 v2, s14, v2
10563; GFX6-NEXT:    v_xor_b32_e32 v3, s14, v3
10564; GFX6-NEXT:    v_mov_b32_e32 v4, s14
10565; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v2
10566; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
10567; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10568; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
10569; GFX6-NEXT:    s_endpgm
10570;
10571; GFX9-LABEL: srem_v2i64_pow2_shl_denom:
10572; GFX9:       ; %bb.0:
10573; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
10574; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
10575; GFX9-NEXT:    v_mov_b32_e32 v4, 0
10576; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10577; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
10578; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
10579; GFX9-NEXT:    s_ashr_i32 s8, s3, 31
10580; GFX9-NEXT:    s_add_u32 s2, s2, s8
10581; GFX9-NEXT:    s_mov_b32 s9, s8
10582; GFX9-NEXT:    s_addc_u32 s3, s3, s8
10583; GFX9-NEXT:    s_xor_b64 s[12:13], s[2:3], s[8:9]
10584; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
10585; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
10586; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
10587; GFX9-NEXT:    s_sub_u32 s0, 0, s12
10588; GFX9-NEXT:    s_subb_u32 s1, 0, s13
10589; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
10590; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
10591; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
10592; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
10593; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
10594; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
10595; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
10596; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
10597; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
10598; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
10599; GFX9-NEXT:    s_mul_i32 s14, s0, s2
10600; GFX9-NEXT:    s_mul_hi_u32 s16, s0, s3
10601; GFX9-NEXT:    s_mul_i32 s15, s1, s3
10602; GFX9-NEXT:    s_add_i32 s14, s16, s14
10603; GFX9-NEXT:    s_add_i32 s14, s14, s15
10604; GFX9-NEXT:    s_mul_i32 s17, s0, s3
10605; GFX9-NEXT:    s_mul_hi_u32 s15, s3, s14
10606; GFX9-NEXT:    s_mul_i32 s16, s3, s14
10607; GFX9-NEXT:    s_mul_hi_u32 s3, s3, s17
10608; GFX9-NEXT:    s_add_u32 s3, s3, s16
10609; GFX9-NEXT:    s_addc_u32 s15, 0, s15
10610; GFX9-NEXT:    s_mul_hi_u32 s18, s2, s17
10611; GFX9-NEXT:    s_mul_i32 s17, s2, s17
10612; GFX9-NEXT:    s_add_u32 s3, s3, s17
10613; GFX9-NEXT:    s_mul_hi_u32 s16, s2, s14
10614; GFX9-NEXT:    s_addc_u32 s3, s15, s18
10615; GFX9-NEXT:    s_addc_u32 s15, s16, 0
10616; GFX9-NEXT:    s_mul_i32 s14, s2, s14
10617; GFX9-NEXT:    s_add_u32 s3, s3, s14
10618; GFX9-NEXT:    s_addc_u32 s14, 0, s15
10619; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s3, v0
10620; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
10621; GFX9-NEXT:    s_addc_u32 s2, s2, s14
10622; GFX9-NEXT:    v_readfirstlane_b32 s14, v0
10623; GFX9-NEXT:    s_mul_i32 s3, s0, s2
10624; GFX9-NEXT:    s_mul_hi_u32 s15, s0, s14
10625; GFX9-NEXT:    s_add_i32 s3, s15, s3
10626; GFX9-NEXT:    s_mul_i32 s1, s1, s14
10627; GFX9-NEXT:    s_add_i32 s3, s3, s1
10628; GFX9-NEXT:    s_mul_i32 s0, s0, s14
10629; GFX9-NEXT:    s_mul_hi_u32 s15, s2, s0
10630; GFX9-NEXT:    s_mul_i32 s16, s2, s0
10631; GFX9-NEXT:    s_mul_i32 s18, s14, s3
10632; GFX9-NEXT:    s_mul_hi_u32 s0, s14, s0
10633; GFX9-NEXT:    s_mul_hi_u32 s17, s14, s3
10634; GFX9-NEXT:    s_add_u32 s0, s0, s18
10635; GFX9-NEXT:    s_addc_u32 s14, 0, s17
10636; GFX9-NEXT:    s_add_u32 s0, s0, s16
10637; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
10638; GFX9-NEXT:    s_addc_u32 s0, s14, s15
10639; GFX9-NEXT:    s_addc_u32 s1, s1, 0
10640; GFX9-NEXT:    s_mul_i32 s3, s2, s3
10641; GFX9-NEXT:    s_add_u32 s0, s0, s3
10642; GFX9-NEXT:    s_addc_u32 s1, 0, s1
10643; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
10644; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
10645; GFX9-NEXT:    s_addc_u32 s2, s2, s1
10646; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
10647; GFX9-NEXT:    s_add_u32 s0, s4, s14
10648; GFX9-NEXT:    s_mov_b32 s15, s14
10649; GFX9-NEXT:    s_addc_u32 s1, s5, s14
10650; GFX9-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
10651; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
10652; GFX9-NEXT:    s_mul_i32 s1, s4, s2
10653; GFX9-NEXT:    s_mul_hi_u32 s15, s4, s3
10654; GFX9-NEXT:    s_mul_hi_u32 s0, s4, s2
10655; GFX9-NEXT:    s_add_u32 s1, s15, s1
10656; GFX9-NEXT:    s_addc_u32 s0, 0, s0
10657; GFX9-NEXT:    s_mul_hi_u32 s16, s5, s3
10658; GFX9-NEXT:    s_mul_i32 s3, s5, s3
10659; GFX9-NEXT:    s_add_u32 s1, s1, s3
10660; GFX9-NEXT:    s_mul_hi_u32 s15, s5, s2
10661; GFX9-NEXT:    s_addc_u32 s0, s0, s16
10662; GFX9-NEXT:    s_addc_u32 s1, s15, 0
10663; GFX9-NEXT:    s_mul_i32 s2, s5, s2
10664; GFX9-NEXT:    s_add_u32 s0, s0, s2
10665; GFX9-NEXT:    s_addc_u32 s1, 0, s1
10666; GFX9-NEXT:    s_mul_i32 s1, s12, s1
10667; GFX9-NEXT:    s_mul_hi_u32 s2, s12, s0
10668; GFX9-NEXT:    s_add_i32 s1, s2, s1
10669; GFX9-NEXT:    s_mul_i32 s2, s13, s0
10670; GFX9-NEXT:    s_mul_i32 s0, s12, s0
10671; GFX9-NEXT:    s_add_i32 s15, s1, s2
10672; GFX9-NEXT:    v_mov_b32_e32 v0, s0
10673; GFX9-NEXT:    s_sub_i32 s1, s5, s15
10674; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
10675; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
10676; GFX9-NEXT:    s_subb_u32 s4, s1, s13
10677; GFX9-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s12, v0
10678; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
10679; GFX9-NEXT:    s_subb_u32 s16, s4, 0
10680; GFX9-NEXT:    s_cmp_ge_u32 s16, s13
10681; GFX9-NEXT:    s_cselect_b32 s17, -1, 0
10682; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v1
10683; GFX9-NEXT:    s_cmp_eq_u32 s16, s13
10684; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[2:3]
10685; GFX9-NEXT:    v_mov_b32_e32 v3, s17
10686; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
10687; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
10688; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[2:3]
10689; GFX9-NEXT:    s_subb_u32 s2, s4, s13
10690; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v1
10691; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
10692; GFX9-NEXT:    s_subb_u32 s0, s2, 0
10693; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
10694; GFX9-NEXT:    s_subb_u32 s2, s5, s15
10695; GFX9-NEXT:    s_cmp_ge_u32 s2, s13
10696; GFX9-NEXT:    v_mov_b32_e32 v5, s16
10697; GFX9-NEXT:    v_mov_b32_e32 v6, s0
10698; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
10699; GFX9-NEXT:    s_cselect_b32 s3, -1, 0
10700; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
10701; GFX9-NEXT:    s_cmp_eq_u32 s2, s13
10702; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s[0:1]
10703; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
10704; GFX9-NEXT:    v_mov_b32_e32 v6, s3
10705; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
10706; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
10707; GFX9-NEXT:    s_ashr_i32 s0, s11, 31
10708; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
10709; GFX9-NEXT:    v_mov_b32_e32 v6, s2
10710; GFX9-NEXT:    s_add_u32 s2, s10, s0
10711; GFX9-NEXT:    s_mov_b32 s1, s0
10712; GFX9-NEXT:    s_addc_u32 s3, s11, s0
10713; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
10714; GFX9-NEXT:    s_xor_b64 s[4:5], s[2:3], s[0:1]
10715; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
10716; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
10717; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s5
10718; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
10719; GFX9-NEXT:    v_xor_b32_e32 v0, s14, v0
10720; GFX9-NEXT:    v_xor_b32_e32 v2, s14, v2
10721; GFX9-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v3
10722; GFX9-NEXT:    v_rcp_f32_e32 v3, v1
10723; GFX9-NEXT:    v_mov_b32_e32 v5, s14
10724; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s14, v0
10725; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v5, vcc
10726; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v3
10727; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
10728; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
10729; GFX9-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
10730; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
10731; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
10732; GFX9-NEXT:    s_sub_u32 s0, 0, s4
10733; GFX9-NEXT:    s_subb_u32 s1, 0, s5
10734; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
10735; GFX9-NEXT:    v_readfirstlane_b32 s11, v3
10736; GFX9-NEXT:    s_mul_hi_u32 s10, s0, s2
10737; GFX9-NEXT:    s_mul_i32 s12, s0, s11
10738; GFX9-NEXT:    s_mul_i32 s3, s1, s2
10739; GFX9-NEXT:    s_add_i32 s10, s10, s12
10740; GFX9-NEXT:    s_add_i32 s10, s10, s3
10741; GFX9-NEXT:    s_mul_i32 s13, s0, s2
10742; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s10
10743; GFX9-NEXT:    s_mul_i32 s12, s2, s10
10744; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s13
10745; GFX9-NEXT:    s_add_u32 s2, s2, s12
10746; GFX9-NEXT:    s_addc_u32 s3, 0, s3
10747; GFX9-NEXT:    s_mul_hi_u32 s14, s11, s13
10748; GFX9-NEXT:    s_mul_i32 s13, s11, s13
10749; GFX9-NEXT:    s_add_u32 s2, s2, s13
10750; GFX9-NEXT:    s_mul_hi_u32 s12, s11, s10
10751; GFX9-NEXT:    s_addc_u32 s2, s3, s14
10752; GFX9-NEXT:    s_addc_u32 s3, s12, 0
10753; GFX9-NEXT:    s_mul_i32 s10, s11, s10
10754; GFX9-NEXT:    s_add_u32 s2, s2, s10
10755; GFX9-NEXT:    s_addc_u32 s3, 0, s3
10756; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
10757; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
10758; GFX9-NEXT:    s_addc_u32 s2, s11, s3
10759; GFX9-NEXT:    v_readfirstlane_b32 s10, v2
10760; GFX9-NEXT:    s_mul_i32 s3, s0, s2
10761; GFX9-NEXT:    s_mul_hi_u32 s11, s0, s10
10762; GFX9-NEXT:    s_add_i32 s3, s11, s3
10763; GFX9-NEXT:    s_mul_i32 s1, s1, s10
10764; GFX9-NEXT:    s_add_i32 s3, s3, s1
10765; GFX9-NEXT:    s_mul_i32 s0, s0, s10
10766; GFX9-NEXT:    s_mul_hi_u32 s11, s2, s0
10767; GFX9-NEXT:    s_mul_i32 s12, s2, s0
10768; GFX9-NEXT:    s_mul_i32 s14, s10, s3
10769; GFX9-NEXT:    s_mul_hi_u32 s0, s10, s0
10770; GFX9-NEXT:    s_mul_hi_u32 s13, s10, s3
10771; GFX9-NEXT:    s_add_u32 s0, s0, s14
10772; GFX9-NEXT:    s_addc_u32 s10, 0, s13
10773; GFX9-NEXT:    s_add_u32 s0, s0, s12
10774; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
10775; GFX9-NEXT:    s_addc_u32 s0, s10, s11
10776; GFX9-NEXT:    s_addc_u32 s1, s1, 0
10777; GFX9-NEXT:    s_mul_i32 s3, s2, s3
10778; GFX9-NEXT:    s_add_u32 s0, s0, s3
10779; GFX9-NEXT:    s_addc_u32 s1, 0, s1
10780; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
10781; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
10782; GFX9-NEXT:    s_addc_u32 s2, s2, s1
10783; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
10784; GFX9-NEXT:    s_add_u32 s0, s6, s10
10785; GFX9-NEXT:    s_mov_b32 s11, s10
10786; GFX9-NEXT:    s_addc_u32 s1, s7, s10
10787; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
10788; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
10789; GFX9-NEXT:    s_mul_i32 s1, s6, s2
10790; GFX9-NEXT:    s_mul_hi_u32 s11, s6, s3
10791; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s2
10792; GFX9-NEXT:    s_add_u32 s1, s11, s1
10793; GFX9-NEXT:    s_addc_u32 s0, 0, s0
10794; GFX9-NEXT:    s_mul_hi_u32 s12, s7, s3
10795; GFX9-NEXT:    s_mul_i32 s3, s7, s3
10796; GFX9-NEXT:    s_add_u32 s1, s1, s3
10797; GFX9-NEXT:    s_mul_hi_u32 s11, s7, s2
10798; GFX9-NEXT:    s_addc_u32 s0, s0, s12
10799; GFX9-NEXT:    s_addc_u32 s1, s11, 0
10800; GFX9-NEXT:    s_mul_i32 s2, s7, s2
10801; GFX9-NEXT:    s_add_u32 s0, s0, s2
10802; GFX9-NEXT:    s_addc_u32 s1, 0, s1
10803; GFX9-NEXT:    s_mul_i32 s1, s4, s1
10804; GFX9-NEXT:    s_mul_hi_u32 s2, s4, s0
10805; GFX9-NEXT:    s_add_i32 s1, s2, s1
10806; GFX9-NEXT:    s_mul_i32 s2, s5, s0
10807; GFX9-NEXT:    s_mul_i32 s0, s4, s0
10808; GFX9-NEXT:    s_add_i32 s11, s1, s2
10809; GFX9-NEXT:    v_mov_b32_e32 v2, s0
10810; GFX9-NEXT:    s_sub_i32 s1, s7, s11
10811; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s6, v2
10812; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
10813; GFX9-NEXT:    s_subb_u32 s6, s1, s5
10814; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s4, v2
10815; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
10816; GFX9-NEXT:    s_subb_u32 s12, s6, 0
10817; GFX9-NEXT:    s_cmp_ge_u32 s12, s5
10818; GFX9-NEXT:    s_cselect_b32 s13, -1, 0
10819; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s4, v3
10820; GFX9-NEXT:    s_cmp_eq_u32 s12, s5
10821; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[2:3]
10822; GFX9-NEXT:    v_mov_b32_e32 v6, s13
10823; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
10824; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
10825; GFX9-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[2:3]
10826; GFX9-NEXT:    s_subb_u32 s2, s6, s5
10827; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s4, v3
10828; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
10829; GFX9-NEXT:    s_subb_u32 s0, s2, 0
10830; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
10831; GFX9-NEXT:    s_subb_u32 s2, s7, s11
10832; GFX9-NEXT:    s_cmp_ge_u32 s2, s5
10833; GFX9-NEXT:    v_mov_b32_e32 v7, s12
10834; GFX9-NEXT:    v_mov_b32_e32 v8, s0
10835; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
10836; GFX9-NEXT:    s_cselect_b32 s3, -1, 0
10837; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
10838; GFX9-NEXT:    s_cmp_eq_u32 s2, s5
10839; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v8, s[0:1]
10840; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
10841; GFX9-NEXT:    v_mov_b32_e32 v8, s3
10842; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
10843; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
10844; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
10845; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
10846; GFX9-NEXT:    v_mov_b32_e32 v8, s2
10847; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
10848; GFX9-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
10849; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
10850; GFX9-NEXT:    v_xor_b32_e32 v3, s10, v5
10851; GFX9-NEXT:    v_mov_b32_e32 v5, s10
10852; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s10, v2
10853; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
10854; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10855; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
10856; GFX9-NEXT:    s_endpgm
; Test input: per-lane denominator is 4096 << %y, then a vector srem by it.
10857  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
10858  %r = srem <2 x i64> %x, %shl.y
10859  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
10860  ret void
10861}
10862