1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
3; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s
5
; udiv_i32: unsigned 32-bit division of the kernel arguments (%x / %y); the
; result is stored through %out.
; CHECK lines (opt RUN line, -amdgpu-codegenprepare) pin the IR that replaces
; the udiv: rcp(uitofp(%y)) is scaled by 0x41F0000000000000 (2^32) and
; converted back to an integer, adjusted through two 64-bit multiply /
; high-half steps, multiplied by %x to form a quotient candidate q, and the
; stored value is selected among q, q+1 and q-1 using two unsigned compares.
; GCN lines (llc RUN line) pin the tahiti assembly emitted for the same kernel.
6define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
7; CHECK-LABEL: @udiv_i32(
8; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
9; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
10; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41F0000000000000
11; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
12; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
13; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[Y]] to i64
14; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP5]], [[TMP6]]
15; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
16; CHECK-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP7]], 32
17; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
18; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 0, [[TMP8]]
19; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP10]], 0
20; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]]
21; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
22; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP4]] to i64
23; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
24; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
25; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
26; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
27; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP4]], [[TMP19]]
28; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[TMP4]], [[TMP19]]
29; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP12]], i32 [[TMP20]], i32 [[TMP21]]
30; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP22]] to i64
31; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[X:%.*]] to i64
32; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
33; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
34; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
35; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
36; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[Y]]
37; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[X]], [[TMP29]]
38; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[Y]]
39; CHECK-NEXT:    [[TMP32:%.*]] = icmp uge i32 [[X]], [[TMP29]]
40; CHECK-NEXT:    [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
41; CHECK-NEXT:    [[TMP34:%.*]] = add i32 [[TMP28]], 1
42; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP28]], 1
43; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP28]]
44; CHECK-NEXT:    [[TMP37:%.*]] = select i1 [[TMP32]], i32 [[TMP36]], i32 [[TMP35]]
45; CHECK-NEXT:    store i32 [[TMP37]], i32 addrspace(1)* [[OUT:%.*]]
46; CHECK-NEXT:    ret void
47;
48; GCN-LABEL: udiv_i32:
49; GCN:       ; %bb.0:
50; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
51; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
52; GCN-NEXT:    s_mov_b32 s7, 0xf000
53; GCN-NEXT:    s_mov_b32 s6, -1
54; GCN-NEXT:    s_waitcnt lgkmcnt(0)
55; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
56; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
57; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
58; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
59; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
60; GCN-NEXT:    v_mul_hi_u32 v2, v0, s9
61; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
62; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v2
63; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[2:3]
64; GCN-NEXT:    v_mul_hi_u32 v1, v1, v0
65; GCN-NEXT:    v_add_i32_e32 v2, vcc, v1, v0
66; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
67; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
68; GCN-NEXT:    v_mul_hi_u32 v0, v0, s8
69; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
70; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
71; GCN-NEXT:    v_add_i32_e32 v3, vcc, -1, v0
72; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s8, v1
73; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, s8, v1
74; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
75; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
76; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
77; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
78; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
79; GCN-NEXT:    s_endpgm
80  %r = udiv i32 %x, %y
81  store i32 %r, i32 addrspace(1)* %out
82  ret void
83}
84
; urem_i32: unsigned 32-bit remainder of the kernel arguments (%x urem %y); the
; result is stored through %out.
; CHECK lines (opt RUN line, -amdgpu-codegenprepare) pin the IR that replaces
; the urem: the same rcp(uitofp(%y)) * 2^32 sequence as the udiv case yields a
; quotient candidate q, then r = %x - q * %y is formed and the stored value is
; selected among r, r - %y and r + %y using two unsigned compares.
; GCN lines (llc RUN line) pin the tahiti assembly emitted for the same kernel.
85define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
86; CHECK-LABEL: @urem_i32(
87; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
88; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
89; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41F0000000000000
90; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
91; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
92; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[Y]] to i64
93; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP5]], [[TMP6]]
94; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
95; CHECK-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP7]], 32
96; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
97; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 0, [[TMP8]]
98; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP10]], 0
99; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]]
100; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
101; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP4]] to i64
102; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
103; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
104; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
105; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
106; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP4]], [[TMP19]]
107; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[TMP4]], [[TMP19]]
108; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP12]], i32 [[TMP20]], i32 [[TMP21]]
109; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP22]] to i64
110; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[X:%.*]] to i64
111; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
112; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
113; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
114; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
115; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[Y]]
116; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[X]], [[TMP29]]
117; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[Y]]
118; CHECK-NEXT:    [[TMP32:%.*]] = icmp uge i32 [[X]], [[TMP29]]
119; CHECK-NEXT:    [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
120; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[Y]]
121; CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP30]], [[Y]]
122; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP30]]
123; CHECK-NEXT:    [[TMP37:%.*]] = select i1 [[TMP32]], i32 [[TMP36]], i32 [[TMP35]]
124; CHECK-NEXT:    store i32 [[TMP37]], i32 addrspace(1)* [[OUT:%.*]]
125; CHECK-NEXT:    ret void
126;
127; GCN-LABEL: urem_i32:
128; GCN:       ; %bb.0:
129; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
130; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
131; GCN-NEXT:    s_mov_b32 s7, 0xf000
132; GCN-NEXT:    s_mov_b32 s6, -1
133; GCN-NEXT:    s_waitcnt lgkmcnt(0)
134; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
135; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
136; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
137; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
138; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
139; GCN-NEXT:    v_mul_hi_u32 v2, v0, s9
140; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
141; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v2
142; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[2:3]
143; GCN-NEXT:    v_mul_hi_u32 v1, v1, v0
144; GCN-NEXT:    v_add_i32_e32 v2, vcc, v1, v0
145; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
146; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
147; GCN-NEXT:    v_mul_hi_u32 v0, v0, s8
148; GCN-NEXT:    v_mul_lo_u32 v0, v0, s9
149; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s8, v0
150; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s8, v0
151; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v1
152; GCN-NEXT:    v_add_i32_e32 v2, vcc, s9, v1
153; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s9, v1
154; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
155; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
156; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[2:3]
157; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
158; GCN-NEXT:    s_endpgm
159  %r = urem i32 %x, %y
160  store i32 %r, i32 addrspace(1)* %out
161  ret void
162}
163
; sdiv_i32: signed 32-bit division of the kernel arguments (%x sdiv %y); the
; result is stored through %out.
; CHECK lines (opt RUN line, -amdgpu-codegenprepare) pin the IR that replaces
; the sdiv: sign masks are taken with ashr 31, each operand is passed through
; add-mask / xor-mask, the unsigned rcp-based sequence (with its q / q+1 / q-1
; selection) runs on those values, and the result is finally combined with the
; xor of the two sign masks via xor + sub.
; GCN lines (llc RUN line) pin the tahiti assembly emitted for the same kernel.
164define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
165; CHECK-LABEL: @sdiv_i32(
166; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
167; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
168; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
169; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
170; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
171; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
172; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
173; CHECK-NEXT:    [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
174; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
175; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41F0000000000000
176; CHECK-NEXT:    [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
177; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
178; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP7]] to i64
179; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP12]], [[TMP13]]
180; CHECK-NEXT:    [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
181; CHECK-NEXT:    [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
182; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
183; CHECK-NEXT:    [[TMP18:%.*]] = sub i32 0, [[TMP15]]
184; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
185; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP18]], i32 [[TMP15]]
186; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP20]] to i64
187; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP11]] to i64
188; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
189; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
190; CHECK-NEXT:    [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
191; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
192; CHECK-NEXT:    [[TMP27:%.*]] = add i32 [[TMP11]], [[TMP26]]
193; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP11]], [[TMP26]]
194; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP19]], i32 [[TMP27]], i32 [[TMP28]]
195; CHECK-NEXT:    [[TMP30:%.*]] = zext i32 [[TMP29]] to i64
196; CHECK-NEXT:    [[TMP31:%.*]] = zext i32 [[TMP6]] to i64
197; CHECK-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP30]], [[TMP31]]
198; CHECK-NEXT:    [[TMP33:%.*]] = trunc i64 [[TMP32]] to i32
199; CHECK-NEXT:    [[TMP34:%.*]] = lshr i64 [[TMP32]], 32
200; CHECK-NEXT:    [[TMP35:%.*]] = trunc i64 [[TMP34]] to i32
201; CHECK-NEXT:    [[TMP36:%.*]] = mul i32 [[TMP35]], [[TMP7]]
202; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP6]], [[TMP36]]
203; CHECK-NEXT:    [[TMP38:%.*]] = icmp uge i32 [[TMP37]], [[TMP7]]
204; CHECK-NEXT:    [[TMP39:%.*]] = icmp uge i32 [[TMP6]], [[TMP36]]
205; CHECK-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
206; CHECK-NEXT:    [[TMP41:%.*]] = add i32 [[TMP35]], 1
207; CHECK-NEXT:    [[TMP42:%.*]] = sub i32 [[TMP35]], 1
208; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP40]], i32 [[TMP41]], i32 [[TMP35]]
209; CHECK-NEXT:    [[TMP44:%.*]] = select i1 [[TMP39]], i32 [[TMP43]], i32 [[TMP42]]
210; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP44]], [[TMP3]]
211; CHECK-NEXT:    [[TMP46:%.*]] = sub i32 [[TMP45]], [[TMP3]]
212; CHECK-NEXT:    store i32 [[TMP46]], i32 addrspace(1)* [[OUT:%.*]]
213; CHECK-NEXT:    ret void
214;
215; GCN-LABEL: sdiv_i32:
216; GCN:       ; %bb.0:
217; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
218; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
219; GCN-NEXT:    s_mov_b32 s7, 0xf000
220; GCN-NEXT:    s_mov_b32 s6, -1
221; GCN-NEXT:    s_waitcnt lgkmcnt(0)
222; GCN-NEXT:    s_ashr_i32 s8, s3, 31
223; GCN-NEXT:    s_add_i32 s3, s3, s8
224; GCN-NEXT:    s_xor_b32 s9, s3, s8
225; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
226; GCN-NEXT:    s_ashr_i32 s3, s2, 31
227; GCN-NEXT:    s_add_i32 s2, s2, s3
228; GCN-NEXT:    s_xor_b32 s2, s2, s3
229; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
230; GCN-NEXT:    s_xor_b32 s3, s3, s8
231; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
232; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
233; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
234; GCN-NEXT:    v_mul_hi_u32 v2, v0, s9
235; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
236; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
237; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
238; GCN-NEXT:    v_mul_hi_u32 v1, v1, v0
239; GCN-NEXT:    v_add_i32_e32 v2, vcc, v1, v0
240; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
241; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
242; GCN-NEXT:    v_mul_hi_u32 v0, v0, s2
243; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
244; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
245; GCN-NEXT:    v_add_i32_e32 v3, vcc, -1, v0
246; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s2, v1
247; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, s2, v1
248; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
249; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
250; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
251; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
252; GCN-NEXT:    v_xor_b32_e32 v0, s3, v0
253; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
254; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
255; GCN-NEXT:    s_endpgm
256  %r = sdiv i32 %x, %y
257  store i32 %r, i32 addrspace(1)* %out
258  ret void
259}
260
; srem_i32: signed 32-bit remainder of the kernel arguments (%x srem %y); the
; result is stored through %out.
; CHECK lines (opt RUN line, -amdgpu-codegenprepare) pin the IR that replaces
; the srem: sign masks are taken with ashr 31, each operand is passed through
; add-mask / xor-mask, the unsigned rcp-based sequence (with its r / r-y / r+y
; selection) runs on those values, and the result is finally combined with the
; sign mask of %x only (xor + sub with TMP1).
; GCN lines (llc RUN line) pin the tahiti assembly emitted for the same kernel.
261define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
262; CHECK-LABEL: @srem_i32(
263; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
264; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
265; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
266; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
267; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
268; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
269; CHECK-NEXT:    [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
270; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
271; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41F0000000000000
272; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
273; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
274; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP6]] to i64
275; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP11]], [[TMP12]]
276; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
277; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP13]], 32
278; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
279; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 0, [[TMP14]]
280; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
281; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP17]], i32 [[TMP14]]
282; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
283; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP10]] to i64
284; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
285; CHECK-NEXT:    [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
286; CHECK-NEXT:    [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
287; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
288; CHECK-NEXT:    [[TMP26:%.*]] = add i32 [[TMP10]], [[TMP25]]
289; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP10]], [[TMP25]]
290; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP18]], i32 [[TMP26]], i32 [[TMP27]]
291; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
292; CHECK-NEXT:    [[TMP30:%.*]] = zext i32 [[TMP5]] to i64
293; CHECK-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP29]], [[TMP30]]
294; CHECK-NEXT:    [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
295; CHECK-NEXT:    [[TMP33:%.*]] = lshr i64 [[TMP31]], 32
296; CHECK-NEXT:    [[TMP34:%.*]] = trunc i64 [[TMP33]] to i32
297; CHECK-NEXT:    [[TMP35:%.*]] = mul i32 [[TMP34]], [[TMP6]]
298; CHECK-NEXT:    [[TMP36:%.*]] = sub i32 [[TMP5]], [[TMP35]]
299; CHECK-NEXT:    [[TMP37:%.*]] = icmp uge i32 [[TMP36]], [[TMP6]]
300; CHECK-NEXT:    [[TMP38:%.*]] = icmp uge i32 [[TMP5]], [[TMP35]]
301; CHECK-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
302; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP36]], [[TMP6]]
303; CHECK-NEXT:    [[TMP41:%.*]] = add i32 [[TMP36]], [[TMP6]]
304; CHECK-NEXT:    [[TMP42:%.*]] = select i1 [[TMP39]], i32 [[TMP40]], i32 [[TMP36]]
305; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP38]], i32 [[TMP42]], i32 [[TMP41]]
306; CHECK-NEXT:    [[TMP44:%.*]] = xor i32 [[TMP43]], [[TMP1]]
307; CHECK-NEXT:    [[TMP45:%.*]] = sub i32 [[TMP44]], [[TMP1]]
308; CHECK-NEXT:    store i32 [[TMP45]], i32 addrspace(1)* [[OUT:%.*]]
309; CHECK-NEXT:    ret void
310;
311; GCN-LABEL: srem_i32:
312; GCN:       ; %bb.0:
313; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
314; GCN-NEXT:    s_mov_b32 s7, 0xf000
315; GCN-NEXT:    s_mov_b32 s6, -1
316; GCN-NEXT:    s_waitcnt lgkmcnt(0)
317; GCN-NEXT:    s_ashr_i32 s2, s5, 31
318; GCN-NEXT:    s_add_i32 s3, s5, s2
319; GCN-NEXT:    s_xor_b32 s10, s3, s2
320; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s10
321; GCN-NEXT:    s_ashr_i32 s8, s4, 31
322; GCN-NEXT:    s_add_i32 s4, s4, s8
323; GCN-NEXT:    s_xor_b32 s9, s4, s8
324; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
325; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
326; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
327; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
328; GCN-NEXT:    v_mul_lo_u32 v1, v0, s10
329; GCN-NEXT:    v_mul_hi_u32 v2, v0, s10
330; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
331; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v2
332; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[2:3]
333; GCN-NEXT:    v_mul_hi_u32 v1, v1, v0
334; GCN-NEXT:    v_add_i32_e32 v2, vcc, v1, v0
335; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
336; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
337; GCN-NEXT:    v_mul_hi_u32 v0, v0, s9
338; GCN-NEXT:    v_mul_lo_u32 v0, v0, s10
339; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s9, v0
340; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s9, v0
341; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v1
342; GCN-NEXT:    v_add_i32_e32 v2, vcc, s10, v1
343; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s10, v1
344; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
345; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
346; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[2:3]
347; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
348; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
349; GCN-NEXT:    s_waitcnt lgkmcnt(0)
350; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
351; GCN-NEXT:    s_endpgm
352  %r = srem i32 %x, %y
353  store i32 %r, i32 addrspace(1)* %out
354  ret void
355}
356
; udiv_i16: unsigned 16-bit division of the kernel arguments (%x / %y); the
; result is stored through %out.
; CHECK lines (opt RUN line, -amdgpu-codegenprepare) pin the shorter float-based
; IR that replaces the udiv: both operands are zero-extended to i32 and
; converted to float, q = trunc(x * rcp(y)) is computed, fmad.ftz(-q, y, x) is
; compared by magnitude against y to add 1 or 0 to q, and the sum is masked
; with 65535 before truncating back to i16.
; GCN lines (llc RUN line) pin the tahiti assembly emitted for the same kernel.
357define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
358; CHECK-LABEL: @udiv_i16(
359; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
360; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
361; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
362; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
363; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
364; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
365; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
366; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
367; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
368; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
369; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
370; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
371; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
372; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
373; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
374; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
375; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
376; CHECK-NEXT:    store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]]
377; CHECK-NEXT:    ret void
378;
379; GCN-LABEL: udiv_i16:
380; GCN:       ; %bb.0:
381; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
382; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
383; GCN-NEXT:    s_waitcnt lgkmcnt(0)
384; GCN-NEXT:    s_lshr_b32 s3, s2, 16
385; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
386; GCN-NEXT:    s_and_b32 s2, s2, 0xffff
387; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
388; GCN-NEXT:    s_mov_b32 s3, 0xf000
389; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
390; GCN-NEXT:    s_mov_b32 s2, -1
391; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
392; GCN-NEXT:    v_trunc_f32_e32 v2, v2
393; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
394; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
395; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
396; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
397; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
398; GCN-NEXT:    s_endpgm
399  %r = udiv i16 %x, %y
400  store i16 %r, i16 addrspace(1)* %out
401  ret void
402}
403
; urem_i16: unsigned 16-bit remainder of the kernel arguments (%x urem %y); the
; result is stored through %out.
; CHECK lines (opt RUN line, -amdgpu-codegenprepare) pin the float-based IR that
; replaces the urem: the quotient is formed exactly as in the udiv i16 case
; (trunc(x * rcp(y)) plus a 0/1 correction), then multiplied by the
; zero-extended %y and subtracted from the zero-extended %x; the difference is
; masked with 65535 before truncating back to i16.
; GCN lines (llc RUN line) pin the tahiti assembly emitted for the same kernel.
404define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
405; CHECK-LABEL: @urem_i16(
406; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
407; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
408; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
409; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
410; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
411; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
412; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
413; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
414; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
415; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
416; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
417; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
418; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
419; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
420; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
421; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
422; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
423; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
424; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
425; CHECK-NEXT:    store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]]
426; CHECK-NEXT:    ret void
427;
428; GCN-LABEL: urem_i16:
429; GCN:       ; %bb.0:
430; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
431; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
432; GCN-NEXT:    s_waitcnt lgkmcnt(0)
433; GCN-NEXT:    s_lshr_b32 s2, s4, 16
434; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
435; GCN-NEXT:    s_and_b32 s3, s4, 0xffff
436; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
437; GCN-NEXT:    s_mov_b32 s3, 0xf000
438; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
439; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
440; GCN-NEXT:    v_trunc_f32_e32 v2, v2
441; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
442; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
443; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
444; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
445; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
446; GCN-NEXT:    s_mov_b32 s2, -1
447; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
448; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
449; GCN-NEXT:    s_endpgm
450  %r = urem i16 %x, %y
451  store i16 %r, i16 addrspace(1)* %out
452  ret void
453}
454
; sdiv_i16: signed 16-bit division of the kernel arguments (%x sdiv %y); the
; result is stored through %out.
; CHECK lines (opt RUN line, -amdgpu-codegenprepare) pin the float-based IR that
; replaces the sdiv: both operands are sign-extended to i32 and converted with
; sitofp, q = trunc(x * rcp(y)) is computed, and when |fmad.ftz(-q, y, x)| >= |y|
; the value ((x xor y) ashr 30) | 1 is added to q (otherwise 0). The sum is
; re-sign-extended from 16 bits with shl 16 / ashr 16 and truncated to i16.
; GCN lines (llc RUN line) pin the tahiti assembly emitted for the same kernel.
455define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
456; CHECK-LABEL: @sdiv_i16(
457; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
458; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
459; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
460; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
461; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
462; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
463; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
464; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
465; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
466; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
467; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
468; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
469; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
470; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
471; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
472; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
473; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
474; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
475; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
476; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
477; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
478; CHECK-NEXT:    store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]]
479; CHECK-NEXT:    ret void
480;
481; GCN-LABEL: sdiv_i16:
482; GCN:       ; %bb.0:
483; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
484; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
485; GCN-NEXT:    s_mov_b32 s7, 0xf000
486; GCN-NEXT:    s_mov_b32 s6, -1
487; GCN-NEXT:    s_waitcnt lgkmcnt(0)
488; GCN-NEXT:    s_ashr_i32 s1, s0, 16
489; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
490; GCN-NEXT:    s_sext_i32_i16 s0, s0
491; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
492; GCN-NEXT:    s_xor_b32 s0, s0, s1
493; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
494; GCN-NEXT:    s_ashr_i32 s0, s0, 30
495; GCN-NEXT:    s_or_b32 s0, s0, 1
496; GCN-NEXT:    v_mov_b32_e32 v3, s0
497; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
498; GCN-NEXT:    v_trunc_f32_e32 v2, v2
499; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
500; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
501; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
502; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
503; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
504; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
505; GCN-NEXT:    s_endpgm
506  %r = sdiv i16 %x, %y
507  store i16 %r, i16 addrspace(1)* %out
508  ret void
509}
510
; srem_i16: signed 16-bit remainder of the kernel arguments (%x srem %y); the
; result is stored through %out.
; CHECK lines (opt RUN line, -amdgpu-codegenprepare) pin the float-based IR that
; replaces the srem: the quotient is formed exactly as in the sdiv i16 case
; (trunc(x * rcp(y)) plus a conditional ((x xor y) ashr 30) | 1 correction),
; then multiplied by the sign-extended %y and subtracted from the sign-extended
; %x. The difference is re-sign-extended from 16 bits with shl 16 / ashr 16 and
; truncated to i16.
; GCN lines (llc RUN line) pin the tahiti assembly emitted for the same kernel.
511define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
512; CHECK-LABEL: @srem_i16(
513; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
514; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
515; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
516; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
517; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
518; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
519; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
520; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
521; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
522; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
523; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
524; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
525; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
526; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
527; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
528; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
529; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
530; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
531; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
532; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
533; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
534; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
535; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
536; CHECK-NEXT:    store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]]
537; CHECK-NEXT:    ret void
538;
539; GCN-LABEL: srem_i16:
540; GCN:       ; %bb.0:
541; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
542; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
543; GCN-NEXT:    s_waitcnt lgkmcnt(0)
544; GCN-NEXT:    s_ashr_i32 s2, s4, 16
545; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
546; GCN-NEXT:    s_sext_i32_i16 s3, s4
547; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
548; GCN-NEXT:    s_xor_b32 s3, s3, s2
549; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
550; GCN-NEXT:    s_ashr_i32 s3, s3, 30
551; GCN-NEXT:    s_or_b32 s3, s3, 1
552; GCN-NEXT:    v_mov_b32_e32 v3, s3
553; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
554; GCN-NEXT:    v_trunc_f32_e32 v2, v2
555; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
556; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
557; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
558; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
559; GCN-NEXT:    s_mov_b32 s3, 0xf000
560; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
561; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
562; GCN-NEXT:    s_mov_b32 s2, -1
563; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
564; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
565; GCN-NEXT:    s_endpgm
566  %r = srem i16 %x, %y
567  store i16 %r, i16 addrspace(1)* %out
568  ret void
569}
570
; udiv_i8: unsigned 8-bit division of the kernel arguments (%x / %y); the
; result is stored through %out.
; CHECK lines (opt RUN line, -amdgpu-codegenprepare) pin the float-based IR that
; replaces the udiv: both operands are zero-extended to i32 and converted to
; float, q = trunc(x * rcp(y)) is computed, fmad.ftz(-q, y, x) is compared by
; magnitude against y to add 1 or 0 to q, and the sum is masked with 255 before
; truncating back to i8.
; GCN lines (llc RUN line) pin the tahiti assembly emitted for the same kernel;
; there both operands are converted from a single loaded dword with
; v_cvt_f32_ubyte0 / v_cvt_f32_ubyte1.
571define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
572; CHECK-LABEL: @udiv_i8(
573; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
574; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
575; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
576; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
577; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
578; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
579; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
580; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
581; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
582; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
583; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
584; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
585; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
586; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
587; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
588; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 255
589; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
590; CHECK-NEXT:    store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]]
591; CHECK-NEXT:    ret void
592;
593; GCN-LABEL: udiv_i8:
594; GCN:       ; %bb.0:
595; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
596; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
597; GCN-NEXT:    s_mov_b32 s7, 0xf000
598; GCN-NEXT:    s_mov_b32 s6, -1
599; GCN-NEXT:    s_waitcnt lgkmcnt(0)
600; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, s0
601; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
602; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
603; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
604; GCN-NEXT:    v_trunc_f32_e32 v1, v1
605; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
606; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
607; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
608; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
609; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
610; GCN-NEXT:    s_endpgm
611  %r = udiv i8 %x, %y
612  store i8 %r, i8 addrspace(1)* %out
613  ret void
614}
615
; urem_i8: unsigned 8-bit remainder of two kernel arguments, stored to %out.
; The IR-level assertions below expect the i8 urem to be rewritten as:
; zero-extend both operands to i32, convert them to float, form a quotient
; estimate as numerator * llvm.amdgcn.rcp.f32(denominator), truncate it, add
; one when |fmad residual| >= |denominator|, then compute x - q * y, mask the
; result to 8 bits and truncate it back to i8.
; The ISA-level assertions pin the machine code produced for the same input.
616define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
617; CHECK-LABEL: @urem_i8(
618; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
619; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
620; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
621; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
622; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
623; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
624; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
625; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
626; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
627; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
628; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
629; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
630; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
631; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
632; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
633; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
634; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
635; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 255
636; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
637; CHECK-NEXT:    store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]]
638; CHECK-NEXT:    ret void
639;
640; GCN-LABEL: urem_i8:
641; GCN:       ; %bb.0:
642; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
643; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
644; GCN-NEXT:    s_mov_b32 s3, 0xf000
645; GCN-NEXT:    s_waitcnt lgkmcnt(0)
646; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
647; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
648; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
649; GCN-NEXT:    s_lshr_b32 s2, s4, 8
650; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
651; GCN-NEXT:    v_trunc_f32_e32 v1, v1
652; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
653; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
654; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
655; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
656; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
657; GCN-NEXT:    s_mov_b32 s2, -1
658; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
659; GCN-NEXT:    buffer_store_byte v0, off, s[0:3], 0
660; GCN-NEXT:    s_endpgm
661  %r = urem i8 %x, %y
662  store i8 %r, i8 addrspace(1)* %out
663  ret void
664}
665
; sdiv_i8: signed 8-bit quotient of two kernel arguments, stored to %out.
; The IR-level assertions below expect the i8 sdiv to be rewritten as:
; sign-extend both operands to i32, derive a +1/-1 correction term from the
; sign of (x xor y) via ashr 30 / or 1, convert the operands to float, form a
; quotient estimate as numerator * llvm.amdgcn.rcp.f32(denominator), truncate
; it, add the correction term when |fmad residual| >= |denominator|, then
; sign-extend the low 8 bits in place (shl 24 / ashr 24) and truncate to i8.
; The ISA-level assertions pin the machine code produced for the same input.
666define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
667; CHECK-LABEL: @sdiv_i8(
668; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
669; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
670; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
671; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
672; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
673; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
674; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
675; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
676; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
677; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
678; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
679; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
680; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
681; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
682; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
683; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
684; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
685; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
686; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 24
687; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
688; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
689; CHECK-NEXT:    store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]]
690; CHECK-NEXT:    ret void
691;
692; GCN-LABEL: sdiv_i8:
693; GCN:       ; %bb.0:
694; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
695; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
696; GCN-NEXT:    s_mov_b32 s7, 0xf000
697; GCN-NEXT:    s_mov_b32 s6, -1
698; GCN-NEXT:    s_waitcnt lgkmcnt(0)
699; GCN-NEXT:    s_bfe_i32 s1, s0, 0x80008
700; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
701; GCN-NEXT:    s_sext_i32_i8 s0, s0
702; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
703; GCN-NEXT:    s_xor_b32 s0, s0, s1
704; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
705; GCN-NEXT:    s_ashr_i32 s0, s0, 30
706; GCN-NEXT:    s_or_b32 s0, s0, 1
707; GCN-NEXT:    v_mov_b32_e32 v3, s0
708; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
709; GCN-NEXT:    v_trunc_f32_e32 v2, v2
710; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
711; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
712; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
713; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
714; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
715; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
716; GCN-NEXT:    s_endpgm
717  %r = sdiv i8 %x, %y
718  store i8 %r, i8 addrspace(1)* %out
719  ret void
720}
721
; srem_i8: signed 8-bit remainder of two kernel arguments, stored to %out.
; The IR-level assertions below expect the same float-reciprocal quotient
; sequence that is asserted for the signed i8 division (sext to i32, sign
; correction term from xor / ashr 30 / or 1, rcp-based estimate, truncation,
; conditional correction), followed by x - q * y to obtain the remainder, an
; in-place sign extension of the low 8 bits (shl 24 / ashr 24) and a truncation
; back to i8.
; The ISA-level assertions pin the machine code produced for the same input.
722define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
723; CHECK-LABEL: @srem_i8(
724; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
725; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
726; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
727; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
728; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
729; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
730; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
731; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
732; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
733; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
734; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
735; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
736; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
737; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
738; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
739; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
740; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
741; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
742; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
743; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
744; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 24
745; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
746; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
747; CHECK-NEXT:    store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]]
748; CHECK-NEXT:    ret void
749;
750; GCN-LABEL: srem_i8:
751; GCN:       ; %bb.0:
752; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
753; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
754; GCN-NEXT:    s_mov_b32 s7, 0xf000
755; GCN-NEXT:    s_mov_b32 s6, -1
756; GCN-NEXT:    s_waitcnt lgkmcnt(0)
757; GCN-NEXT:    s_bfe_i32 s1, s0, 0x80008
758; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
759; GCN-NEXT:    s_sext_i32_i8 s3, s0
760; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
761; GCN-NEXT:    s_xor_b32 s1, s3, s1
762; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
763; GCN-NEXT:    s_ashr_i32 s1, s1, 30
764; GCN-NEXT:    s_or_b32 s1, s1, 1
765; GCN-NEXT:    v_mov_b32_e32 v3, s1
766; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
767; GCN-NEXT:    v_trunc_f32_e32 v2, v2
768; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
769; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
770; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
771; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
772; GCN-NEXT:    s_lshr_b32 s2, s0, 8
773; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
774; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
775; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
776; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
777; GCN-NEXT:    s_endpgm
778  %r = srem i8 %x, %y
779  store i8 %r, i8 addrspace(1)* %out
780  ret void
781}
782
; udiv_v4i32: element-wise unsigned division of two <4 x i32> kernel arguments,
; stored to %out.
; The IR-level assertions below expect the vector udiv to be split into four
; scalar expansions (extractelement of lane 0..3 from each operand, then
; insertelement of each result into the output vector). Each lane is expected
; to: convert the divisor to float, take llvm.amdgcn.rcp.f32 of it, scale by
; 0x41F0000000000000 (2^32) and convert to an i32 reciprocal estimate; refine
; that estimate using the high/low halves of 64-bit products; multiply by the
; numerator and keep the high 32 bits as the quotient candidate; and finally
; select between candidate, candidate + 1 and candidate - 1 based on two
; unsigned comparisons against the reconstructed product.
; The ISA-level assertions pin the machine code produced for the same input.
783define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
784; CHECK-LABEL: @udiv_v4i32(
785; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
786; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
787; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
788; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
789; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41F0000000000000
790; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
791; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
792; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP2]] to i64
793; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
794; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
795; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
796; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
797; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP10]]
798; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
799; CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP10]]
800; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
801; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP6]] to i64
802; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
803; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
804; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
805; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
806; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP6]], [[TMP21]]
807; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP6]], [[TMP21]]
808; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP14]], i32 [[TMP22]], i32 [[TMP23]]
809; CHECK-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
810; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[TMP1]] to i64
811; CHECK-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP25]], [[TMP26]]
812; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
813; CHECK-NEXT:    [[TMP29:%.*]] = lshr i64 [[TMP27]], 32
814; CHECK-NEXT:    [[TMP30:%.*]] = trunc i64 [[TMP29]] to i32
815; CHECK-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], [[TMP2]]
816; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP1]], [[TMP31]]
817; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP2]]
818; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP1]], [[TMP31]]
819; CHECK-NEXT:    [[TMP35:%.*]] = and i1 [[TMP33]], [[TMP34]]
820; CHECK-NEXT:    [[TMP36:%.*]] = add i32 [[TMP30]], 1
821; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP30]], 1
822; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP35]], i32 [[TMP36]], i32 [[TMP30]]
823; CHECK-NEXT:    [[TMP39:%.*]] = select i1 [[TMP34]], i32 [[TMP38]], i32 [[TMP37]]
824; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i32> undef, i32 [[TMP39]], i64 0
825; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i32> [[X]], i64 1
826; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i32> [[Y]], i64 1
827; CHECK-NEXT:    [[TMP43:%.*]] = uitofp i32 [[TMP42]] to float
828; CHECK-NEXT:    [[TMP44:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP43]])
829; CHECK-NEXT:    [[TMP45:%.*]] = fmul fast float [[TMP44]], 0x41F0000000000000
830; CHECK-NEXT:    [[TMP46:%.*]] = fptoui float [[TMP45]] to i32
831; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP46]] to i64
832; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP42]] to i64
833; CHECK-NEXT:    [[TMP49:%.*]] = mul i64 [[TMP47]], [[TMP48]]
834; CHECK-NEXT:    [[TMP50:%.*]] = trunc i64 [[TMP49]] to i32
835; CHECK-NEXT:    [[TMP51:%.*]] = lshr i64 [[TMP49]], 32
836; CHECK-NEXT:    [[TMP52:%.*]] = trunc i64 [[TMP51]] to i32
837; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 0, [[TMP50]]
838; CHECK-NEXT:    [[TMP54:%.*]] = icmp eq i32 [[TMP52]], 0
839; CHECK-NEXT:    [[TMP55:%.*]] = select i1 [[TMP54]], i32 [[TMP53]], i32 [[TMP50]]
840; CHECK-NEXT:    [[TMP56:%.*]] = zext i32 [[TMP55]] to i64
841; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP46]] to i64
842; CHECK-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP56]], [[TMP57]]
843; CHECK-NEXT:    [[TMP59:%.*]] = trunc i64 [[TMP58]] to i32
844; CHECK-NEXT:    [[TMP60:%.*]] = lshr i64 [[TMP58]], 32
845; CHECK-NEXT:    [[TMP61:%.*]] = trunc i64 [[TMP60]] to i32
846; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP46]], [[TMP61]]
847; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP46]], [[TMP61]]
848; CHECK-NEXT:    [[TMP64:%.*]] = select i1 [[TMP54]], i32 [[TMP62]], i32 [[TMP63]]
849; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP64]] to i64
850; CHECK-NEXT:    [[TMP66:%.*]] = zext i32 [[TMP41]] to i64
851; CHECK-NEXT:    [[TMP67:%.*]] = mul i64 [[TMP65]], [[TMP66]]
852; CHECK-NEXT:    [[TMP68:%.*]] = trunc i64 [[TMP67]] to i32
853; CHECK-NEXT:    [[TMP69:%.*]] = lshr i64 [[TMP67]], 32
854; CHECK-NEXT:    [[TMP70:%.*]] = trunc i64 [[TMP69]] to i32
855; CHECK-NEXT:    [[TMP71:%.*]] = mul i32 [[TMP70]], [[TMP42]]
856; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP41]], [[TMP71]]
857; CHECK-NEXT:    [[TMP73:%.*]] = icmp uge i32 [[TMP72]], [[TMP42]]
858; CHECK-NEXT:    [[TMP74:%.*]] = icmp uge i32 [[TMP41]], [[TMP71]]
859; CHECK-NEXT:    [[TMP75:%.*]] = and i1 [[TMP73]], [[TMP74]]
860; CHECK-NEXT:    [[TMP76:%.*]] = add i32 [[TMP70]], 1
861; CHECK-NEXT:    [[TMP77:%.*]] = sub i32 [[TMP70]], 1
862; CHECK-NEXT:    [[TMP78:%.*]] = select i1 [[TMP75]], i32 [[TMP76]], i32 [[TMP70]]
863; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP74]], i32 [[TMP78]], i32 [[TMP77]]
864; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP79]], i64 1
865; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i32> [[X]], i64 2
866; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i32> [[Y]], i64 2
867; CHECK-NEXT:    [[TMP83:%.*]] = uitofp i32 [[TMP82]] to float
868; CHECK-NEXT:    [[TMP84:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP83]])
869; CHECK-NEXT:    [[TMP85:%.*]] = fmul fast float [[TMP84]], 0x41F0000000000000
870; CHECK-NEXT:    [[TMP86:%.*]] = fptoui float [[TMP85]] to i32
871; CHECK-NEXT:    [[TMP87:%.*]] = zext i32 [[TMP86]] to i64
872; CHECK-NEXT:    [[TMP88:%.*]] = zext i32 [[TMP82]] to i64
873; CHECK-NEXT:    [[TMP89:%.*]] = mul i64 [[TMP87]], [[TMP88]]
874; CHECK-NEXT:    [[TMP90:%.*]] = trunc i64 [[TMP89]] to i32
875; CHECK-NEXT:    [[TMP91:%.*]] = lshr i64 [[TMP89]], 32
876; CHECK-NEXT:    [[TMP92:%.*]] = trunc i64 [[TMP91]] to i32
877; CHECK-NEXT:    [[TMP93:%.*]] = sub i32 0, [[TMP90]]
878; CHECK-NEXT:    [[TMP94:%.*]] = icmp eq i32 [[TMP92]], 0
879; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP93]], i32 [[TMP90]]
880; CHECK-NEXT:    [[TMP96:%.*]] = zext i32 [[TMP95]] to i64
881; CHECK-NEXT:    [[TMP97:%.*]] = zext i32 [[TMP86]] to i64
882; CHECK-NEXT:    [[TMP98:%.*]] = mul i64 [[TMP96]], [[TMP97]]
883; CHECK-NEXT:    [[TMP99:%.*]] = trunc i64 [[TMP98]] to i32
884; CHECK-NEXT:    [[TMP100:%.*]] = lshr i64 [[TMP98]], 32
885; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
886; CHECK-NEXT:    [[TMP102:%.*]] = add i32 [[TMP86]], [[TMP101]]
887; CHECK-NEXT:    [[TMP103:%.*]] = sub i32 [[TMP86]], [[TMP101]]
888; CHECK-NEXT:    [[TMP104:%.*]] = select i1 [[TMP94]], i32 [[TMP102]], i32 [[TMP103]]
889; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP104]] to i64
890; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP81]] to i64
891; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
892; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
893; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
894; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
895; CHECK-NEXT:    [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP82]]
896; CHECK-NEXT:    [[TMP112:%.*]] = sub i32 [[TMP81]], [[TMP111]]
897; CHECK-NEXT:    [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP82]]
898; CHECK-NEXT:    [[TMP114:%.*]] = icmp uge i32 [[TMP81]], [[TMP111]]
899; CHECK-NEXT:    [[TMP115:%.*]] = and i1 [[TMP113]], [[TMP114]]
900; CHECK-NEXT:    [[TMP116:%.*]] = add i32 [[TMP110]], 1
901; CHECK-NEXT:    [[TMP117:%.*]] = sub i32 [[TMP110]], 1
902; CHECK-NEXT:    [[TMP118:%.*]] = select i1 [[TMP115]], i32 [[TMP116]], i32 [[TMP110]]
903; CHECK-NEXT:    [[TMP119:%.*]] = select i1 [[TMP114]], i32 [[TMP118]], i32 [[TMP117]]
904; CHECK-NEXT:    [[TMP120:%.*]] = insertelement <4 x i32> [[TMP80]], i32 [[TMP119]], i64 2
905; CHECK-NEXT:    [[TMP121:%.*]] = extractelement <4 x i32> [[X]], i64 3
906; CHECK-NEXT:    [[TMP122:%.*]] = extractelement <4 x i32> [[Y]], i64 3
907; CHECK-NEXT:    [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float
908; CHECK-NEXT:    [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]])
909; CHECK-NEXT:    [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41F0000000000000
910; CHECK-NEXT:    [[TMP126:%.*]] = fptoui float [[TMP125]] to i32
911; CHECK-NEXT:    [[TMP127:%.*]] = zext i32 [[TMP126]] to i64
912; CHECK-NEXT:    [[TMP128:%.*]] = zext i32 [[TMP122]] to i64
913; CHECK-NEXT:    [[TMP129:%.*]] = mul i64 [[TMP127]], [[TMP128]]
914; CHECK-NEXT:    [[TMP130:%.*]] = trunc i64 [[TMP129]] to i32
915; CHECK-NEXT:    [[TMP131:%.*]] = lshr i64 [[TMP129]], 32
916; CHECK-NEXT:    [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32
917; CHECK-NEXT:    [[TMP133:%.*]] = sub i32 0, [[TMP130]]
918; CHECK-NEXT:    [[TMP134:%.*]] = icmp eq i32 [[TMP132]], 0
919; CHECK-NEXT:    [[TMP135:%.*]] = select i1 [[TMP134]], i32 [[TMP133]], i32 [[TMP130]]
920; CHECK-NEXT:    [[TMP136:%.*]] = zext i32 [[TMP135]] to i64
921; CHECK-NEXT:    [[TMP137:%.*]] = zext i32 [[TMP126]] to i64
922; CHECK-NEXT:    [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]]
923; CHECK-NEXT:    [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32
924; CHECK-NEXT:    [[TMP140:%.*]] = lshr i64 [[TMP138]], 32
925; CHECK-NEXT:    [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32
926; CHECK-NEXT:    [[TMP142:%.*]] = add i32 [[TMP126]], [[TMP141]]
927; CHECK-NEXT:    [[TMP143:%.*]] = sub i32 [[TMP126]], [[TMP141]]
928; CHECK-NEXT:    [[TMP144:%.*]] = select i1 [[TMP134]], i32 [[TMP142]], i32 [[TMP143]]
929; CHECK-NEXT:    [[TMP145:%.*]] = zext i32 [[TMP144]] to i64
930; CHECK-NEXT:    [[TMP146:%.*]] = zext i32 [[TMP121]] to i64
931; CHECK-NEXT:    [[TMP147:%.*]] = mul i64 [[TMP145]], [[TMP146]]
932; CHECK-NEXT:    [[TMP148:%.*]] = trunc i64 [[TMP147]] to i32
933; CHECK-NEXT:    [[TMP149:%.*]] = lshr i64 [[TMP147]], 32
934; CHECK-NEXT:    [[TMP150:%.*]] = trunc i64 [[TMP149]] to i32
935; CHECK-NEXT:    [[TMP151:%.*]] = mul i32 [[TMP150]], [[TMP122]]
936; CHECK-NEXT:    [[TMP152:%.*]] = sub i32 [[TMP121]], [[TMP151]]
937; CHECK-NEXT:    [[TMP153:%.*]] = icmp uge i32 [[TMP152]], [[TMP122]]
938; CHECK-NEXT:    [[TMP154:%.*]] = icmp uge i32 [[TMP121]], [[TMP151]]
939; CHECK-NEXT:    [[TMP155:%.*]] = and i1 [[TMP153]], [[TMP154]]
940; CHECK-NEXT:    [[TMP156:%.*]] = add i32 [[TMP150]], 1
941; CHECK-NEXT:    [[TMP157:%.*]] = sub i32 [[TMP150]], 1
942; CHECK-NEXT:    [[TMP158:%.*]] = select i1 [[TMP155]], i32 [[TMP156]], i32 [[TMP150]]
943; CHECK-NEXT:    [[TMP159:%.*]] = select i1 [[TMP154]], i32 [[TMP158]], i32 [[TMP157]]
944; CHECK-NEXT:    [[TMP160:%.*]] = insertelement <4 x i32> [[TMP120]], i32 [[TMP159]], i64 3
945; CHECK-NEXT:    store <4 x i32> [[TMP160]], <4 x i32> addrspace(1)* [[OUT:%.*]]
946; CHECK-NEXT:    ret void
947;
948; GCN-LABEL: udiv_v4i32:
949; GCN:       ; %bb.0:
950; GCN-NEXT:    s_load_dwordx8 s[8:15], s[0:1], 0xd
951; GCN-NEXT:    s_mov_b32 s6, 0x4f800000
952; GCN-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x9
953; GCN-NEXT:    s_mov_b32 s19, 0xf000
954; GCN-NEXT:    s_mov_b32 s18, -1
955; GCN-NEXT:    s_waitcnt lgkmcnt(0)
956; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
957; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
958; GCN-NEXT:    v_cvt_f32_u32_e32 v7, s15
959; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
960; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
961; GCN-NEXT:    v_mul_f32_e32 v0, s6, v0
962; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
963; GCN-NEXT:    v_mul_f32_e32 v1, s6, v1
964; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
965; GCN-NEXT:    v_mul_hi_u32 v2, v0, s12
966; GCN-NEXT:    v_mul_lo_u32 v3, v0, s12
967; GCN-NEXT:    v_mul_hi_u32 v4, v1, s13
968; GCN-NEXT:    v_mul_lo_u32 v5, v1, s13
969; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
970; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v3
971; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v6, s[0:1]
972; GCN-NEXT:    v_mul_hi_u32 v2, v2, v0
973; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v5
974; GCN-NEXT:    v_add_i32_e32 v6, vcc, v2, v0
975; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
976; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
977; GCN-NEXT:    v_mul_hi_u32 v0, v0, s8
978; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
979; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
980; GCN-NEXT:    v_mul_hi_u32 v2, v2, v1
981; GCN-NEXT:    v_mul_lo_u32 v3, v0, s12
982; GCN-NEXT:    v_add_i32_e32 v4, vcc, -1, v0
983; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s8, v3
984; GCN-NEXT:    v_cmp_le_u32_e64 s[4:5], s12, v5
985; GCN-NEXT:    v_add_i32_e32 v5, vcc, v2, v1
986; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v2, v1
987; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s14
988; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
989; GCN-NEXT:    v_mul_hi_u32 v1, v1, s9
990; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s8, v3
991; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
992; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
993; GCN-NEXT:    s_and_b64 vcc, s[4:5], s[2:3]
994; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
995; GCN-NEXT:    v_mul_f32_e32 v2, s6, v2
996; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
997; GCN-NEXT:    v_mul_lo_u32 v3, v1, s13
998; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[2:3]
999; GCN-NEXT:    v_mul_hi_u32 v6, v2, s14
1000; GCN-NEXT:    v_mul_lo_u32 v5, v2, s14
1001; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s9, v3
1002; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s9, v3
1003; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v6
1004; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v5
1005; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
1006; GCN-NEXT:    v_mul_hi_u32 v3, v3, v2
1007; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v4
1008; GCN-NEXT:    v_add_i32_e32 v4, vcc, -1, v1
1009; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v1
1010; GCN-NEXT:    v_add_i32_e32 v6, vcc, v3, v2
1011; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v3, v2
1012; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v7
1013; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
1014; GCN-NEXT:    v_mul_hi_u32 v2, v2, s10
1015; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
1016; GCN-NEXT:    v_mul_f32_e32 v3, s6, v3
1017; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1018; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1019; GCN-NEXT:    v_mul_lo_u32 v5, v2, s14
1020; GCN-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[2:3]
1021; GCN-NEXT:    v_mul_hi_u32 v7, v3, s15
1022; GCN-NEXT:    v_mul_lo_u32 v6, v3, s15
1023; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s10, v5
1024; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v4
1025; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v7
1026; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v6
1027; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[2:3]
1028; GCN-NEXT:    v_mul_hi_u32 v4, v4, v3
1029; GCN-NEXT:    v_add_i32_e32 v6, vcc, -1, v2
1030; GCN-NEXT:    v_add_i32_e32 v7, vcc, v4, v3
1031; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, v4, v3
1032; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
1033; GCN-NEXT:    v_mul_hi_u32 v3, v3, s11
1034; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s10, v5
1035; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
1036; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
1037; GCN-NEXT:    v_mul_lo_u32 v5, v3, s15
1038; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
1039; GCN-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
1040; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v5
1041; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v4
1042; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s11, v5
1043; GCN-NEXT:    v_add_i32_e32 v4, vcc, -1, v3
1044; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
1045; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
1046; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
1047; GCN-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[2:3]
1048; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
1049; GCN-NEXT:    s_endpgm
1050  %r = udiv <4 x i32> %x, %y
1051  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1052  ret void
1053}
1054
1055define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1056; CHECK-LABEL: @urem_v4i32(
1057; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1058; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1059; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
1060; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
1061; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41F0000000000000
1062; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
1063; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
1064; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP2]] to i64
1065; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
1066; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
1067; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
1068; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
1069; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP10]]
1070; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
1071; CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP10]]
1072; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
1073; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP6]] to i64
1074; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1075; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1076; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1077; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1078; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP6]], [[TMP21]]
1079; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP6]], [[TMP21]]
1080; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP14]], i32 [[TMP22]], i32 [[TMP23]]
1081; CHECK-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
1082; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[TMP1]] to i64
1083; CHECK-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP25]], [[TMP26]]
1084; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
1085; CHECK-NEXT:    [[TMP29:%.*]] = lshr i64 [[TMP27]], 32
1086; CHECK-NEXT:    [[TMP30:%.*]] = trunc i64 [[TMP29]] to i32
1087; CHECK-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], [[TMP2]]
1088; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP1]], [[TMP31]]
1089; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP2]]
1090; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP1]], [[TMP31]]
1091; CHECK-NEXT:    [[TMP35:%.*]] = and i1 [[TMP33]], [[TMP34]]
1092; CHECK-NEXT:    [[TMP36:%.*]] = sub i32 [[TMP32]], [[TMP2]]
1093; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP2]]
1094; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP35]], i32 [[TMP36]], i32 [[TMP32]]
1095; CHECK-NEXT:    [[TMP39:%.*]] = select i1 [[TMP34]], i32 [[TMP38]], i32 [[TMP37]]
1096; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i32> undef, i32 [[TMP39]], i64 0
1097; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i32> [[X]], i64 1
1098; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1099; CHECK-NEXT:    [[TMP43:%.*]] = uitofp i32 [[TMP42]] to float
1100; CHECK-NEXT:    [[TMP44:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP43]])
1101; CHECK-NEXT:    [[TMP45:%.*]] = fmul fast float [[TMP44]], 0x41F0000000000000
1102; CHECK-NEXT:    [[TMP46:%.*]] = fptoui float [[TMP45]] to i32
1103; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP46]] to i64
1104; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP42]] to i64
1105; CHECK-NEXT:    [[TMP49:%.*]] = mul i64 [[TMP47]], [[TMP48]]
1106; CHECK-NEXT:    [[TMP50:%.*]] = trunc i64 [[TMP49]] to i32
1107; CHECK-NEXT:    [[TMP51:%.*]] = lshr i64 [[TMP49]], 32
1108; CHECK-NEXT:    [[TMP52:%.*]] = trunc i64 [[TMP51]] to i32
1109; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 0, [[TMP50]]
1110; CHECK-NEXT:    [[TMP54:%.*]] = icmp eq i32 [[TMP52]], 0
1111; CHECK-NEXT:    [[TMP55:%.*]] = select i1 [[TMP54]], i32 [[TMP53]], i32 [[TMP50]]
1112; CHECK-NEXT:    [[TMP56:%.*]] = zext i32 [[TMP55]] to i64
1113; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP46]] to i64
1114; CHECK-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP56]], [[TMP57]]
1115; CHECK-NEXT:    [[TMP59:%.*]] = trunc i64 [[TMP58]] to i32
1116; CHECK-NEXT:    [[TMP60:%.*]] = lshr i64 [[TMP58]], 32
1117; CHECK-NEXT:    [[TMP61:%.*]] = trunc i64 [[TMP60]] to i32
1118; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP46]], [[TMP61]]
1119; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP46]], [[TMP61]]
1120; CHECK-NEXT:    [[TMP64:%.*]] = select i1 [[TMP54]], i32 [[TMP62]], i32 [[TMP63]]
1121; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP64]] to i64
1122; CHECK-NEXT:    [[TMP66:%.*]] = zext i32 [[TMP41]] to i64
1123; CHECK-NEXT:    [[TMP67:%.*]] = mul i64 [[TMP65]], [[TMP66]]
1124; CHECK-NEXT:    [[TMP68:%.*]] = trunc i64 [[TMP67]] to i32
1125; CHECK-NEXT:    [[TMP69:%.*]] = lshr i64 [[TMP67]], 32
1126; CHECK-NEXT:    [[TMP70:%.*]] = trunc i64 [[TMP69]] to i32
1127; CHECK-NEXT:    [[TMP71:%.*]] = mul i32 [[TMP70]], [[TMP42]]
1128; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP41]], [[TMP71]]
1129; CHECK-NEXT:    [[TMP73:%.*]] = icmp uge i32 [[TMP72]], [[TMP42]]
1130; CHECK-NEXT:    [[TMP74:%.*]] = icmp uge i32 [[TMP41]], [[TMP71]]
1131; CHECK-NEXT:    [[TMP75:%.*]] = and i1 [[TMP73]], [[TMP74]]
1132; CHECK-NEXT:    [[TMP76:%.*]] = sub i32 [[TMP72]], [[TMP42]]
1133; CHECK-NEXT:    [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP42]]
1134; CHECK-NEXT:    [[TMP78:%.*]] = select i1 [[TMP75]], i32 [[TMP76]], i32 [[TMP72]]
1135; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP74]], i32 [[TMP78]], i32 [[TMP77]]
1136; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP79]], i64 1
1137; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i32> [[X]], i64 2
1138; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1139; CHECK-NEXT:    [[TMP83:%.*]] = uitofp i32 [[TMP82]] to float
1140; CHECK-NEXT:    [[TMP84:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP83]])
1141; CHECK-NEXT:    [[TMP85:%.*]] = fmul fast float [[TMP84]], 0x41F0000000000000
1142; CHECK-NEXT:    [[TMP86:%.*]] = fptoui float [[TMP85]] to i32
1143; CHECK-NEXT:    [[TMP87:%.*]] = zext i32 [[TMP86]] to i64
1144; CHECK-NEXT:    [[TMP88:%.*]] = zext i32 [[TMP82]] to i64
1145; CHECK-NEXT:    [[TMP89:%.*]] = mul i64 [[TMP87]], [[TMP88]]
1146; CHECK-NEXT:    [[TMP90:%.*]] = trunc i64 [[TMP89]] to i32
1147; CHECK-NEXT:    [[TMP91:%.*]] = lshr i64 [[TMP89]], 32
1148; CHECK-NEXT:    [[TMP92:%.*]] = trunc i64 [[TMP91]] to i32
1149; CHECK-NEXT:    [[TMP93:%.*]] = sub i32 0, [[TMP90]]
1150; CHECK-NEXT:    [[TMP94:%.*]] = icmp eq i32 [[TMP92]], 0
1151; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP93]], i32 [[TMP90]]
1152; CHECK-NEXT:    [[TMP96:%.*]] = zext i32 [[TMP95]] to i64
1153; CHECK-NEXT:    [[TMP97:%.*]] = zext i32 [[TMP86]] to i64
1154; CHECK-NEXT:    [[TMP98:%.*]] = mul i64 [[TMP96]], [[TMP97]]
1155; CHECK-NEXT:    [[TMP99:%.*]] = trunc i64 [[TMP98]] to i32
1156; CHECK-NEXT:    [[TMP100:%.*]] = lshr i64 [[TMP98]], 32
1157; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
1158; CHECK-NEXT:    [[TMP102:%.*]] = add i32 [[TMP86]], [[TMP101]]
1159; CHECK-NEXT:    [[TMP103:%.*]] = sub i32 [[TMP86]], [[TMP101]]
1160; CHECK-NEXT:    [[TMP104:%.*]] = select i1 [[TMP94]], i32 [[TMP102]], i32 [[TMP103]]
1161; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP104]] to i64
1162; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP81]] to i64
1163; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1164; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1165; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1166; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1167; CHECK-NEXT:    [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP82]]
1168; CHECK-NEXT:    [[TMP112:%.*]] = sub i32 [[TMP81]], [[TMP111]]
1169; CHECK-NEXT:    [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP82]]
1170; CHECK-NEXT:    [[TMP114:%.*]] = icmp uge i32 [[TMP81]], [[TMP111]]
1171; CHECK-NEXT:    [[TMP115:%.*]] = and i1 [[TMP113]], [[TMP114]]
1172; CHECK-NEXT:    [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP82]]
1173; CHECK-NEXT:    [[TMP117:%.*]] = add i32 [[TMP112]], [[TMP82]]
1174; CHECK-NEXT:    [[TMP118:%.*]] = select i1 [[TMP115]], i32 [[TMP116]], i32 [[TMP112]]
1175; CHECK-NEXT:    [[TMP119:%.*]] = select i1 [[TMP114]], i32 [[TMP118]], i32 [[TMP117]]
1176; CHECK-NEXT:    [[TMP120:%.*]] = insertelement <4 x i32> [[TMP80]], i32 [[TMP119]], i64 2
1177; CHECK-NEXT:    [[TMP121:%.*]] = extractelement <4 x i32> [[X]], i64 3
1178; CHECK-NEXT:    [[TMP122:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1179; CHECK-NEXT:    [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float
1180; CHECK-NEXT:    [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]])
1181; CHECK-NEXT:    [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41F0000000000000
1182; CHECK-NEXT:    [[TMP126:%.*]] = fptoui float [[TMP125]] to i32
1183; CHECK-NEXT:    [[TMP127:%.*]] = zext i32 [[TMP126]] to i64
1184; CHECK-NEXT:    [[TMP128:%.*]] = zext i32 [[TMP122]] to i64
1185; CHECK-NEXT:    [[TMP129:%.*]] = mul i64 [[TMP127]], [[TMP128]]
1186; CHECK-NEXT:    [[TMP130:%.*]] = trunc i64 [[TMP129]] to i32
1187; CHECK-NEXT:    [[TMP131:%.*]] = lshr i64 [[TMP129]], 32
1188; CHECK-NEXT:    [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32
1189; CHECK-NEXT:    [[TMP133:%.*]] = sub i32 0, [[TMP130]]
1190; CHECK-NEXT:    [[TMP134:%.*]] = icmp eq i32 [[TMP132]], 0
1191; CHECK-NEXT:    [[TMP135:%.*]] = select i1 [[TMP134]], i32 [[TMP133]], i32 [[TMP130]]
1192; CHECK-NEXT:    [[TMP136:%.*]] = zext i32 [[TMP135]] to i64
1193; CHECK-NEXT:    [[TMP137:%.*]] = zext i32 [[TMP126]] to i64
1194; CHECK-NEXT:    [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]]
1195; CHECK-NEXT:    [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32
1196; CHECK-NEXT:    [[TMP140:%.*]] = lshr i64 [[TMP138]], 32
1197; CHECK-NEXT:    [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32
1198; CHECK-NEXT:    [[TMP142:%.*]] = add i32 [[TMP126]], [[TMP141]]
1199; CHECK-NEXT:    [[TMP143:%.*]] = sub i32 [[TMP126]], [[TMP141]]
1200; CHECK-NEXT:    [[TMP144:%.*]] = select i1 [[TMP134]], i32 [[TMP142]], i32 [[TMP143]]
1201; CHECK-NEXT:    [[TMP145:%.*]] = zext i32 [[TMP144]] to i64
1202; CHECK-NEXT:    [[TMP146:%.*]] = zext i32 [[TMP121]] to i64
1203; CHECK-NEXT:    [[TMP147:%.*]] = mul i64 [[TMP145]], [[TMP146]]
1204; CHECK-NEXT:    [[TMP148:%.*]] = trunc i64 [[TMP147]] to i32
1205; CHECK-NEXT:    [[TMP149:%.*]] = lshr i64 [[TMP147]], 32
1206; CHECK-NEXT:    [[TMP150:%.*]] = trunc i64 [[TMP149]] to i32
1207; CHECK-NEXT:    [[TMP151:%.*]] = mul i32 [[TMP150]], [[TMP122]]
1208; CHECK-NEXT:    [[TMP152:%.*]] = sub i32 [[TMP121]], [[TMP151]]
1209; CHECK-NEXT:    [[TMP153:%.*]] = icmp uge i32 [[TMP152]], [[TMP122]]
1210; CHECK-NEXT:    [[TMP154:%.*]] = icmp uge i32 [[TMP121]], [[TMP151]]
1211; CHECK-NEXT:    [[TMP155:%.*]] = and i1 [[TMP153]], [[TMP154]]
1212; CHECK-NEXT:    [[TMP156:%.*]] = sub i32 [[TMP152]], [[TMP122]]
1213; CHECK-NEXT:    [[TMP157:%.*]] = add i32 [[TMP152]], [[TMP122]]
1214; CHECK-NEXT:    [[TMP158:%.*]] = select i1 [[TMP155]], i32 [[TMP156]], i32 [[TMP152]]
1215; CHECK-NEXT:    [[TMP159:%.*]] = select i1 [[TMP154]], i32 [[TMP158]], i32 [[TMP157]]
1216; CHECK-NEXT:    [[TMP160:%.*]] = insertelement <4 x i32> [[TMP120]], i32 [[TMP159]], i64 3
1217; CHECK-NEXT:    store <4 x i32> [[TMP160]], <4 x i32> addrspace(1)* [[OUT:%.*]]
1218; CHECK-NEXT:    ret void
1219;
1220; GCN-LABEL: urem_v4i32:
1221; GCN:       ; %bb.0:
1222; GCN-NEXT:    s_load_dwordx8 s[8:15], s[0:1], 0xd
1223; GCN-NEXT:    s_mov_b32 s6, 0x4f800000
1224; GCN-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x9
1225; GCN-NEXT:    s_mov_b32 s19, 0xf000
1226; GCN-NEXT:    s_mov_b32 s18, -1
1227; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1228; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
1229; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
1230; GCN-NEXT:    v_cvt_f32_u32_e32 v7, s15
1231; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1232; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1233; GCN-NEXT:    v_mul_f32_e32 v0, s6, v0
1234; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
1235; GCN-NEXT:    v_mul_f32_e32 v1, s6, v1
1236; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1237; GCN-NEXT:    v_mul_lo_u32 v2, v0, s12
1238; GCN-NEXT:    v_mul_hi_u32 v3, v0, s12
1239; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
1240; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
1241; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
1242; GCN-NEXT:    v_mul_hi_u32 v2, v2, v0
1243; GCN-NEXT:    v_mul_lo_u32 v3, v1, s13
1244; GCN-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
1245; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
1246; GCN-NEXT:    v_mul_hi_u32 v2, v1, s13
1247; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1248; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
1249; GCN-NEXT:    v_mul_hi_u32 v0, v0, s8
1250; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
1251; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
1252; GCN-NEXT:    v_mul_hi_u32 v2, v2, v1
1253; GCN-NEXT:    v_mul_lo_u32 v0, v0, s12
1254; GCN-NEXT:    v_add_i32_e32 v5, vcc, v2, v1
1255; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v2, v1
1256; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s14
1257; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
1258; GCN-NEXT:    v_mul_hi_u32 v1, v1, s9
1259; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v0
1260; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1261; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], s8, v0
1262; GCN-NEXT:    v_mul_lo_u32 v1, v1, s13
1263; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v3
1264; GCN-NEXT:    v_mul_f32_e32 v2, s6, v2
1265; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1266; GCN-NEXT:    v_add_i32_e32 v4, vcc, s12, v3
1267; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v3
1268; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[4:5]
1269; GCN-NEXT:    v_mul_lo_u32 v5, v2, s14
1270; GCN-NEXT:    v_mul_hi_u32 v6, v2, s14
1271; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
1272; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
1273; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s9, v1
1274; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s9, v1
1275; GCN-NEXT:    v_sub_i32_e32 v1, vcc, 0, v5
1276; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v6
1277; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
1278; GCN-NEXT:    v_mul_hi_u32 v1, v1, v2
1279; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v3
1280; GCN-NEXT:    v_add_i32_e32 v4, vcc, s13, v3
1281; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s13, v3
1282; GCN-NEXT:    v_add_i32_e32 v6, vcc, v1, v2
1283; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v1, v2
1284; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[4:5]
1285; GCN-NEXT:    v_mul_hi_u32 v1, v1, s10
1286; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v7
1287; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
1288; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
1289; GCN-NEXT:    v_mul_lo_u32 v5, v1, s14
1290; GCN-NEXT:    v_mul_f32_e32 v1, s6, v2
1291; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v1
1292; GCN-NEXT:    v_cndmask_b32_e64 v1, v4, v3, s[2:3]
1293; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s10, v5
1294; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v3
1295; GCN-NEXT:    v_mul_lo_u32 v4, v2, s15
1296; GCN-NEXT:    v_mul_hi_u32 v6, v2, s15
1297; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 0, v4
1298; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v6
1299; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[2:3]
1300; GCN-NEXT:    v_mul_hi_u32 v4, v4, v2
1301; GCN-NEXT:    v_add_i32_e32 v6, vcc, s14, v3
1302; GCN-NEXT:    v_add_i32_e32 v7, vcc, v4, v2
1303; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v4, v2
1304; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[2:3]
1305; GCN-NEXT:    v_mul_hi_u32 v2, v2, s11
1306; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s10, v5
1307; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s14, v3
1308; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
1309; GCN-NEXT:    v_mul_lo_u32 v5, v2, s15
1310; GCN-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
1311; GCN-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
1312; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s11, v5
1313; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s11, v5
1314; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v3
1315; GCN-NEXT:    v_add_i32_e32 v4, vcc, s15, v3
1316; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s15, v3
1317; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
1318; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
1319; GCN-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[2:3]
1320; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
1321; GCN-NEXT:    s_endpgm
1322  %r = urem <4 x i32> %x, %y
1323  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1324  ret void
1325}
1326
; sdiv_v4i32: signed division of two <4 x i32> kernel arguments; the quotient
; vector is stored to %out.
;
; The IR assertions (default FileCheck prefix) show the vector sdiv being
; scalarized. For each of the four lanes:
;   * the operands are extracted and their signs taken with `ashr ..., 31`;
;   * absolute values are formed as (v + sign) ^ sign;
;   * an unsigned quotient is computed from a float reciprocal estimate
;     (uitofp, llvm.amdgcn.rcp.f32, scale by 0x41F0000000000000 = 2^32,
;     fptoui), refined with 64-bit multiply / high-half steps, then corrected
;     by +1 / -1 through two selects;
;   * the result sign s = sign(x) ^ sign(y) is re-applied as (q ^ s) - s and
;     the lane is inserted into the result vector.
;
; The ISA assertions (second FileCheck prefix) pin the exact instruction
; schedule and register assignment produced by llc. Both sets of assertions
; are autogenerated (see the NOTE lines at the top of the file) and should be
; regenerated with the update scripts rather than edited by hand.
1327define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1328; CHECK-LABEL: @sdiv_v4i32(
1329; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1330; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1331; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
1332; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
1333; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
1334; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
1335; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
1336; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
1337; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
1338; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
1339; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
1340; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41F0000000000000
1341; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
1342; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
1343; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP9]] to i64
1344; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
1345; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
1346; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
1347; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1348; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 0, [[TMP17]]
1349; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
1350; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 [[TMP17]]
1351; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP22]] to i64
1352; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP13]] to i64
1353; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
1354; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
1355; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
1356; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
1357; CHECK-NEXT:    [[TMP29:%.*]] = add i32 [[TMP13]], [[TMP28]]
1358; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP13]], [[TMP28]]
1359; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP21]], i32 [[TMP29]], i32 [[TMP30]]
1360; CHECK-NEXT:    [[TMP32:%.*]] = zext i32 [[TMP31]] to i64
1361; CHECK-NEXT:    [[TMP33:%.*]] = zext i32 [[TMP8]] to i64
1362; CHECK-NEXT:    [[TMP34:%.*]] = mul i64 [[TMP32]], [[TMP33]]
1363; CHECK-NEXT:    [[TMP35:%.*]] = trunc i64 [[TMP34]] to i32
1364; CHECK-NEXT:    [[TMP36:%.*]] = lshr i64 [[TMP34]], 32
1365; CHECK-NEXT:    [[TMP37:%.*]] = trunc i64 [[TMP36]] to i32
1366; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP9]]
1367; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 [[TMP8]], [[TMP38]]
1368; CHECK-NEXT:    [[TMP40:%.*]] = icmp uge i32 [[TMP39]], [[TMP9]]
1369; CHECK-NEXT:    [[TMP41:%.*]] = icmp uge i32 [[TMP8]], [[TMP38]]
1370; CHECK-NEXT:    [[TMP42:%.*]] = and i1 [[TMP40]], [[TMP41]]
1371; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP37]], 1
1372; CHECK-NEXT:    [[TMP44:%.*]] = sub i32 [[TMP37]], 1
1373; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP42]], i32 [[TMP43]], i32 [[TMP37]]
1374; CHECK-NEXT:    [[TMP46:%.*]] = select i1 [[TMP41]], i32 [[TMP45]], i32 [[TMP44]]
1375; CHECK-NEXT:    [[TMP47:%.*]] = xor i32 [[TMP46]], [[TMP5]]
1376; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP47]], [[TMP5]]
1377; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <4 x i32> undef, i32 [[TMP48]], i64 0
1378; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <4 x i32> [[X]], i64 1
1379; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1380; CHECK-NEXT:    [[TMP52:%.*]] = ashr i32 [[TMP50]], 31
1381; CHECK-NEXT:    [[TMP53:%.*]] = ashr i32 [[TMP51]], 31
1382; CHECK-NEXT:    [[TMP54:%.*]] = xor i32 [[TMP52]], [[TMP53]]
1383; CHECK-NEXT:    [[TMP55:%.*]] = add i32 [[TMP50]], [[TMP52]]
1384; CHECK-NEXT:    [[TMP56:%.*]] = add i32 [[TMP51]], [[TMP53]]
1385; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP52]]
1386; CHECK-NEXT:    [[TMP58:%.*]] = xor i32 [[TMP56]], [[TMP53]]
1387; CHECK-NEXT:    [[TMP59:%.*]] = uitofp i32 [[TMP58]] to float
1388; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP59]])
1389; CHECK-NEXT:    [[TMP61:%.*]] = fmul fast float [[TMP60]], 0x41F0000000000000
1390; CHECK-NEXT:    [[TMP62:%.*]] = fptoui float [[TMP61]] to i32
1391; CHECK-NEXT:    [[TMP63:%.*]] = zext i32 [[TMP62]] to i64
1392; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP58]] to i64
1393; CHECK-NEXT:    [[TMP65:%.*]] = mul i64 [[TMP63]], [[TMP64]]
1394; CHECK-NEXT:    [[TMP66:%.*]] = trunc i64 [[TMP65]] to i32
1395; CHECK-NEXT:    [[TMP67:%.*]] = lshr i64 [[TMP65]], 32
1396; CHECK-NEXT:    [[TMP68:%.*]] = trunc i64 [[TMP67]] to i32
1397; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 0, [[TMP66]]
1398; CHECK-NEXT:    [[TMP70:%.*]] = icmp eq i32 [[TMP68]], 0
1399; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP69]], i32 [[TMP66]]
1400; CHECK-NEXT:    [[TMP72:%.*]] = zext i32 [[TMP71]] to i64
1401; CHECK-NEXT:    [[TMP73:%.*]] = zext i32 [[TMP62]] to i64
1402; CHECK-NEXT:    [[TMP74:%.*]] = mul i64 [[TMP72]], [[TMP73]]
1403; CHECK-NEXT:    [[TMP75:%.*]] = trunc i64 [[TMP74]] to i32
1404; CHECK-NEXT:    [[TMP76:%.*]] = lshr i64 [[TMP74]], 32
1405; CHECK-NEXT:    [[TMP77:%.*]] = trunc i64 [[TMP76]] to i32
1406; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP62]], [[TMP77]]
1407; CHECK-NEXT:    [[TMP79:%.*]] = sub i32 [[TMP62]], [[TMP77]]
1408; CHECK-NEXT:    [[TMP80:%.*]] = select i1 [[TMP70]], i32 [[TMP78]], i32 [[TMP79]]
1409; CHECK-NEXT:    [[TMP81:%.*]] = zext i32 [[TMP80]] to i64
1410; CHECK-NEXT:    [[TMP82:%.*]] = zext i32 [[TMP57]] to i64
1411; CHECK-NEXT:    [[TMP83:%.*]] = mul i64 [[TMP81]], [[TMP82]]
1412; CHECK-NEXT:    [[TMP84:%.*]] = trunc i64 [[TMP83]] to i32
1413; CHECK-NEXT:    [[TMP85:%.*]] = lshr i64 [[TMP83]], 32
1414; CHECK-NEXT:    [[TMP86:%.*]] = trunc i64 [[TMP85]] to i32
1415; CHECK-NEXT:    [[TMP87:%.*]] = mul i32 [[TMP86]], [[TMP58]]
1416; CHECK-NEXT:    [[TMP88:%.*]] = sub i32 [[TMP57]], [[TMP87]]
1417; CHECK-NEXT:    [[TMP89:%.*]] = icmp uge i32 [[TMP88]], [[TMP58]]
1418; CHECK-NEXT:    [[TMP90:%.*]] = icmp uge i32 [[TMP57]], [[TMP87]]
1419; CHECK-NEXT:    [[TMP91:%.*]] = and i1 [[TMP89]], [[TMP90]]
1420; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP86]], 1
1421; CHECK-NEXT:    [[TMP93:%.*]] = sub i32 [[TMP86]], 1
1422; CHECK-NEXT:    [[TMP94:%.*]] = select i1 [[TMP91]], i32 [[TMP92]], i32 [[TMP86]]
1423; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP90]], i32 [[TMP94]], i32 [[TMP93]]
1424; CHECK-NEXT:    [[TMP96:%.*]] = xor i32 [[TMP95]], [[TMP54]]
1425; CHECK-NEXT:    [[TMP97:%.*]] = sub i32 [[TMP96]], [[TMP54]]
1426; CHECK-NEXT:    [[TMP98:%.*]] = insertelement <4 x i32> [[TMP49]], i32 [[TMP97]], i64 1
1427; CHECK-NEXT:    [[TMP99:%.*]] = extractelement <4 x i32> [[X]], i64 2
1428; CHECK-NEXT:    [[TMP100:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1429; CHECK-NEXT:    [[TMP101:%.*]] = ashr i32 [[TMP99]], 31
1430; CHECK-NEXT:    [[TMP102:%.*]] = ashr i32 [[TMP100]], 31
1431; CHECK-NEXT:    [[TMP103:%.*]] = xor i32 [[TMP101]], [[TMP102]]
1432; CHECK-NEXT:    [[TMP104:%.*]] = add i32 [[TMP99]], [[TMP101]]
1433; CHECK-NEXT:    [[TMP105:%.*]] = add i32 [[TMP100]], [[TMP102]]
1434; CHECK-NEXT:    [[TMP106:%.*]] = xor i32 [[TMP104]], [[TMP101]]
1435; CHECK-NEXT:    [[TMP107:%.*]] = xor i32 [[TMP105]], [[TMP102]]
1436; CHECK-NEXT:    [[TMP108:%.*]] = uitofp i32 [[TMP107]] to float
1437; CHECK-NEXT:    [[TMP109:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP108]])
1438; CHECK-NEXT:    [[TMP110:%.*]] = fmul fast float [[TMP109]], 0x41F0000000000000
1439; CHECK-NEXT:    [[TMP111:%.*]] = fptoui float [[TMP110]] to i32
1440; CHECK-NEXT:    [[TMP112:%.*]] = zext i32 [[TMP111]] to i64
1441; CHECK-NEXT:    [[TMP113:%.*]] = zext i32 [[TMP107]] to i64
1442; CHECK-NEXT:    [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]]
1443; CHECK-NEXT:    [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32
1444; CHECK-NEXT:    [[TMP116:%.*]] = lshr i64 [[TMP114]], 32
1445; CHECK-NEXT:    [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32
1446; CHECK-NEXT:    [[TMP118:%.*]] = sub i32 0, [[TMP115]]
1447; CHECK-NEXT:    [[TMP119:%.*]] = icmp eq i32 [[TMP117]], 0
1448; CHECK-NEXT:    [[TMP120:%.*]] = select i1 [[TMP119]], i32 [[TMP118]], i32 [[TMP115]]
1449; CHECK-NEXT:    [[TMP121:%.*]] = zext i32 [[TMP120]] to i64
1450; CHECK-NEXT:    [[TMP122:%.*]] = zext i32 [[TMP111]] to i64
1451; CHECK-NEXT:    [[TMP123:%.*]] = mul i64 [[TMP121]], [[TMP122]]
1452; CHECK-NEXT:    [[TMP124:%.*]] = trunc i64 [[TMP123]] to i32
1453; CHECK-NEXT:    [[TMP125:%.*]] = lshr i64 [[TMP123]], 32
1454; CHECK-NEXT:    [[TMP126:%.*]] = trunc i64 [[TMP125]] to i32
1455; CHECK-NEXT:    [[TMP127:%.*]] = add i32 [[TMP111]], [[TMP126]]
1456; CHECK-NEXT:    [[TMP128:%.*]] = sub i32 [[TMP111]], [[TMP126]]
1457; CHECK-NEXT:    [[TMP129:%.*]] = select i1 [[TMP119]], i32 [[TMP127]], i32 [[TMP128]]
1458; CHECK-NEXT:    [[TMP130:%.*]] = zext i32 [[TMP129]] to i64
1459; CHECK-NEXT:    [[TMP131:%.*]] = zext i32 [[TMP106]] to i64
1460; CHECK-NEXT:    [[TMP132:%.*]] = mul i64 [[TMP130]], [[TMP131]]
1461; CHECK-NEXT:    [[TMP133:%.*]] = trunc i64 [[TMP132]] to i32
1462; CHECK-NEXT:    [[TMP134:%.*]] = lshr i64 [[TMP132]], 32
1463; CHECK-NEXT:    [[TMP135:%.*]] = trunc i64 [[TMP134]] to i32
1464; CHECK-NEXT:    [[TMP136:%.*]] = mul i32 [[TMP135]], [[TMP107]]
1465; CHECK-NEXT:    [[TMP137:%.*]] = sub i32 [[TMP106]], [[TMP136]]
1466; CHECK-NEXT:    [[TMP138:%.*]] = icmp uge i32 [[TMP137]], [[TMP107]]
1467; CHECK-NEXT:    [[TMP139:%.*]] = icmp uge i32 [[TMP106]], [[TMP136]]
1468; CHECK-NEXT:    [[TMP140:%.*]] = and i1 [[TMP138]], [[TMP139]]
1469; CHECK-NEXT:    [[TMP141:%.*]] = add i32 [[TMP135]], 1
1470; CHECK-NEXT:    [[TMP142:%.*]] = sub i32 [[TMP135]], 1
1471; CHECK-NEXT:    [[TMP143:%.*]] = select i1 [[TMP140]], i32 [[TMP141]], i32 [[TMP135]]
1472; CHECK-NEXT:    [[TMP144:%.*]] = select i1 [[TMP139]], i32 [[TMP143]], i32 [[TMP142]]
1473; CHECK-NEXT:    [[TMP145:%.*]] = xor i32 [[TMP144]], [[TMP103]]
1474; CHECK-NEXT:    [[TMP146:%.*]] = sub i32 [[TMP145]], [[TMP103]]
1475; CHECK-NEXT:    [[TMP147:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP146]], i64 2
1476; CHECK-NEXT:    [[TMP148:%.*]] = extractelement <4 x i32> [[X]], i64 3
1477; CHECK-NEXT:    [[TMP149:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1478; CHECK-NEXT:    [[TMP150:%.*]] = ashr i32 [[TMP148]], 31
1479; CHECK-NEXT:    [[TMP151:%.*]] = ashr i32 [[TMP149]], 31
1480; CHECK-NEXT:    [[TMP152:%.*]] = xor i32 [[TMP150]], [[TMP151]]
1481; CHECK-NEXT:    [[TMP153:%.*]] = add i32 [[TMP148]], [[TMP150]]
1482; CHECK-NEXT:    [[TMP154:%.*]] = add i32 [[TMP149]], [[TMP151]]
1483; CHECK-NEXT:    [[TMP155:%.*]] = xor i32 [[TMP153]], [[TMP150]]
1484; CHECK-NEXT:    [[TMP156:%.*]] = xor i32 [[TMP154]], [[TMP151]]
1485; CHECK-NEXT:    [[TMP157:%.*]] = uitofp i32 [[TMP156]] to float
1486; CHECK-NEXT:    [[TMP158:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP157]])
1487; CHECK-NEXT:    [[TMP159:%.*]] = fmul fast float [[TMP158]], 0x41F0000000000000
1488; CHECK-NEXT:    [[TMP160:%.*]] = fptoui float [[TMP159]] to i32
1489; CHECK-NEXT:    [[TMP161:%.*]] = zext i32 [[TMP160]] to i64
1490; CHECK-NEXT:    [[TMP162:%.*]] = zext i32 [[TMP156]] to i64
1491; CHECK-NEXT:    [[TMP163:%.*]] = mul i64 [[TMP161]], [[TMP162]]
1492; CHECK-NEXT:    [[TMP164:%.*]] = trunc i64 [[TMP163]] to i32
1493; CHECK-NEXT:    [[TMP165:%.*]] = lshr i64 [[TMP163]], 32
1494; CHECK-NEXT:    [[TMP166:%.*]] = trunc i64 [[TMP165]] to i32
1495; CHECK-NEXT:    [[TMP167:%.*]] = sub i32 0, [[TMP164]]
1496; CHECK-NEXT:    [[TMP168:%.*]] = icmp eq i32 [[TMP166]], 0
1497; CHECK-NEXT:    [[TMP169:%.*]] = select i1 [[TMP168]], i32 [[TMP167]], i32 [[TMP164]]
1498; CHECK-NEXT:    [[TMP170:%.*]] = zext i32 [[TMP169]] to i64
1499; CHECK-NEXT:    [[TMP171:%.*]] = zext i32 [[TMP160]] to i64
1500; CHECK-NEXT:    [[TMP172:%.*]] = mul i64 [[TMP170]], [[TMP171]]
1501; CHECK-NEXT:    [[TMP173:%.*]] = trunc i64 [[TMP172]] to i32
1502; CHECK-NEXT:    [[TMP174:%.*]] = lshr i64 [[TMP172]], 32
1503; CHECK-NEXT:    [[TMP175:%.*]] = trunc i64 [[TMP174]] to i32
1504; CHECK-NEXT:    [[TMP176:%.*]] = add i32 [[TMP160]], [[TMP175]]
1505; CHECK-NEXT:    [[TMP177:%.*]] = sub i32 [[TMP160]], [[TMP175]]
1506; CHECK-NEXT:    [[TMP178:%.*]] = select i1 [[TMP168]], i32 [[TMP176]], i32 [[TMP177]]
1507; CHECK-NEXT:    [[TMP179:%.*]] = zext i32 [[TMP178]] to i64
1508; CHECK-NEXT:    [[TMP180:%.*]] = zext i32 [[TMP155]] to i64
1509; CHECK-NEXT:    [[TMP181:%.*]] = mul i64 [[TMP179]], [[TMP180]]
1510; CHECK-NEXT:    [[TMP182:%.*]] = trunc i64 [[TMP181]] to i32
1511; CHECK-NEXT:    [[TMP183:%.*]] = lshr i64 [[TMP181]], 32
1512; CHECK-NEXT:    [[TMP184:%.*]] = trunc i64 [[TMP183]] to i32
1513; CHECK-NEXT:    [[TMP185:%.*]] = mul i32 [[TMP184]], [[TMP156]]
1514; CHECK-NEXT:    [[TMP186:%.*]] = sub i32 [[TMP155]], [[TMP185]]
1515; CHECK-NEXT:    [[TMP187:%.*]] = icmp uge i32 [[TMP186]], [[TMP156]]
1516; CHECK-NEXT:    [[TMP188:%.*]] = icmp uge i32 [[TMP155]], [[TMP185]]
1517; CHECK-NEXT:    [[TMP189:%.*]] = and i1 [[TMP187]], [[TMP188]]
1518; CHECK-NEXT:    [[TMP190:%.*]] = add i32 [[TMP184]], 1
1519; CHECK-NEXT:    [[TMP191:%.*]] = sub i32 [[TMP184]], 1
1520; CHECK-NEXT:    [[TMP192:%.*]] = select i1 [[TMP189]], i32 [[TMP190]], i32 [[TMP184]]
1521; CHECK-NEXT:    [[TMP193:%.*]] = select i1 [[TMP188]], i32 [[TMP192]], i32 [[TMP191]]
1522; CHECK-NEXT:    [[TMP194:%.*]] = xor i32 [[TMP193]], [[TMP152]]
1523; CHECK-NEXT:    [[TMP195:%.*]] = sub i32 [[TMP194]], [[TMP152]]
1524; CHECK-NEXT:    [[TMP196:%.*]] = insertelement <4 x i32> [[TMP147]], i32 [[TMP195]], i64 3
1525; CHECK-NEXT:    store <4 x i32> [[TMP196]], <4 x i32> addrspace(1)* [[OUT:%.*]]
1526; CHECK-NEXT:    ret void
1527;
1528; GCN-LABEL: sdiv_v4i32:
1529; GCN:       ; %bb.0:
1530; GCN-NEXT:    s_load_dwordx8 s[12:19], s[0:1], 0xd
1531; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
1532; GCN-NEXT:    s_mov_b32 s11, 0xf000
1533; GCN-NEXT:    s_mov_b32 s10, -1
1534; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1535; GCN-NEXT:    s_ashr_i32 s2, s16, 31
1536; GCN-NEXT:    s_add_i32 s3, s16, s2
1537; GCN-NEXT:    s_xor_b32 s5, s3, s2
1538; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s5
1539; GCN-NEXT:    s_mov_b32 s16, 0x4f800000
1540; GCN-NEXT:    s_ashr_i32 s6, s17, 31
1541; GCN-NEXT:    s_add_i32 s0, s17, s6
1542; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1543; GCN-NEXT:    s_xor_b32 s17, s0, s6
1544; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s17
1545; GCN-NEXT:    s_ashr_i32 s3, s12, 31
1546; GCN-NEXT:    v_mul_f32_e32 v0, s16, v0
1547; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
1548; GCN-NEXT:    s_add_i32 s4, s12, s3
1549; GCN-NEXT:    s_xor_b32 s4, s4, s3
1550; GCN-NEXT:    s_xor_b32 s7, s3, s2
1551; GCN-NEXT:    v_mul_lo_u32 v1, v0, s5
1552; GCN-NEXT:    v_mul_hi_u32 v2, v0, s5
1553; GCN-NEXT:    s_ashr_i32 s12, s13, 31
1554; GCN-NEXT:    s_add_i32 s13, s13, s12
1555; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v1
1556; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
1557; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1558; GCN-NEXT:    v_mul_hi_u32 v1, v1, v0
1559; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v3
1560; GCN-NEXT:    s_xor_b32 s13, s13, s12
1561; GCN-NEXT:    v_add_i32_e32 v3, vcc, v1, v0
1562; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
1563; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
1564; GCN-NEXT:    v_mul_hi_u32 v0, v0, s4
1565; GCN-NEXT:    v_mul_f32_e32 v1, s16, v2
1566; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1567; GCN-NEXT:    v_mul_lo_u32 v2, v0, s5
1568; GCN-NEXT:    v_add_i32_e32 v3, vcc, -1, v0
1569; GCN-NEXT:    v_mul_hi_u32 v5, v1, s17
1570; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s4, v2
1571; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s5, v4
1572; GCN-NEXT:    v_mul_lo_u32 v4, v1, s17
1573; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], s4, v2
1574; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
1575; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
1576; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v4
1577; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[4:5]
1578; GCN-NEXT:    v_mul_hi_u32 v4, v4, v1
1579; GCN-NEXT:    v_add_i32_e32 v5, vcc, v4, v1
1580; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v4, v1
1581; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
1582; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
1583; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
1584; GCN-NEXT:    s_ashr_i32 s5, s18, 31
1585; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[0:1]
1586; GCN-NEXT:    s_add_i32 s0, s18, s5
1587; GCN-NEXT:    s_xor_b32 s4, s12, s6
1588; GCN-NEXT:    s_xor_b32 s12, s0, s5
1589; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s12
1590; GCN-NEXT:    v_mul_hi_u32 v1, v1, s13
1591; GCN-NEXT:    v_xor_b32_e32 v0, s7, v0
1592; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s7, v0
1593; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
1594; GCN-NEXT:    v_mul_lo_u32 v2, v1, s17
1595; GCN-NEXT:    s_ashr_i32 s6, s19, 31
1596; GCN-NEXT:    v_mul_f32_e32 v4, s16, v4
1597; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s13, v2
1598; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
1599; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s17, v3
1600; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s13, v2
1601; GCN-NEXT:    v_add_i32_e32 v3, vcc, -1, v1
1602; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
1603; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
1604; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1605; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[2:3]
1606; GCN-NEXT:    v_mul_lo_u32 v2, v4, s12
1607; GCN-NEXT:    v_mul_hi_u32 v3, v4, s12
1608; GCN-NEXT:    s_ashr_i32 s2, s14, 31
1609; GCN-NEXT:    s_add_i32 s3, s14, s2
1610; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
1611; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
1612; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
1613; GCN-NEXT:    v_mul_hi_u32 v2, v2, v4
1614; GCN-NEXT:    s_xor_b32 s3, s3, s2
1615; GCN-NEXT:    v_xor_b32_e32 v1, s4, v1
1616; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v1
1617; GCN-NEXT:    v_add_i32_e32 v3, vcc, v2, v4
1618; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v2, v4
1619; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
1620; GCN-NEXT:    s_add_i32 s0, s19, s6
1621; GCN-NEXT:    s_xor_b32 s14, s0, s6
1622; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s14
1623; GCN-NEXT:    v_mul_hi_u32 v2, v2, s3
1624; GCN-NEXT:    s_xor_b32 s7, s2, s5
1625; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
1626; GCN-NEXT:    v_mul_lo_u32 v3, v2, s12
1627; GCN-NEXT:    v_mul_f32_e32 v4, s16, v4
1628; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
1629; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s3, v3
1630; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v5
1631; GCN-NEXT:    s_ashr_i32 s12, s15, 31
1632; GCN-NEXT:    v_mul_lo_u32 v6, v4, s14
1633; GCN-NEXT:    v_mul_hi_u32 v7, v4, s14
1634; GCN-NEXT:    s_add_i32 s13, s15, s12
1635; GCN-NEXT:    s_xor_b32 s13, s13, s12
1636; GCN-NEXT:    v_sub_i32_e32 v8, vcc, 0, v6
1637; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v7
1638; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s[4:5]
1639; GCN-NEXT:    v_mul_hi_u32 v6, v6, v4
1640; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s3, v3
1641; GCN-NEXT:    v_add_i32_e32 v5, vcc, -1, v2
1642; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
1643; GCN-NEXT:    v_add_i32_e32 v7, vcc, v6, v4
1644; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v6, v4
1645; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
1646; GCN-NEXT:    v_mul_hi_u32 v4, v4, s13
1647; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
1648; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
1649; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
1650; GCN-NEXT:    v_mul_lo_u32 v3, v4, s14
1651; GCN-NEXT:    v_xor_b32_e32 v2, s7, v2
1652; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v2
1653; GCN-NEXT:    s_xor_b32 s4, s12, s6
1654; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s13, v3
1655; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v5
1656; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s13, v3
1657; GCN-NEXT:    v_add_i32_e32 v5, vcc, -1, v4
1658; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v4
1659; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
1660; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
1661; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[2:3]
1662; GCN-NEXT:    v_xor_b32_e32 v3, s4, v3
1663; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v3
1664; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
1665; GCN-NEXT:    s_endpgm
1666  %r = sdiv <4 x i32> %x, %y
1667  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1668  ret void
1669}
1670
1671define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1672; CHECK-LABEL: @srem_v4i32(
1673; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1674; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1675; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
1676; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
1677; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
1678; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
1679; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
1680; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
1681; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
1682; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
1683; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41F0000000000000
1684; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
1685; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
1686; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP8]] to i64
1687; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
1688; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
1689; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
1690; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
1691; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 0, [[TMP16]]
1692; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP18]], 0
1693; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP19]], i32 [[TMP16]]
1694; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP21]] to i64
1695; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP12]] to i64
1696; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
1697; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
1698; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
1699; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
1700; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP12]], [[TMP27]]
1701; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP12]], [[TMP27]]
1702; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP20]], i32 [[TMP28]], i32 [[TMP29]]
1703; CHECK-NEXT:    [[TMP31:%.*]] = zext i32 [[TMP30]] to i64
1704; CHECK-NEXT:    [[TMP32:%.*]] = zext i32 [[TMP7]] to i64
1705; CHECK-NEXT:    [[TMP33:%.*]] = mul i64 [[TMP31]], [[TMP32]]
1706; CHECK-NEXT:    [[TMP34:%.*]] = trunc i64 [[TMP33]] to i32
1707; CHECK-NEXT:    [[TMP35:%.*]] = lshr i64 [[TMP33]], 32
1708; CHECK-NEXT:    [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
1709; CHECK-NEXT:    [[TMP37:%.*]] = mul i32 [[TMP36]], [[TMP8]]
1710; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP7]], [[TMP37]]
1711; CHECK-NEXT:    [[TMP39:%.*]] = icmp uge i32 [[TMP38]], [[TMP8]]
1712; CHECK-NEXT:    [[TMP40:%.*]] = icmp uge i32 [[TMP7]], [[TMP37]]
1713; CHECK-NEXT:    [[TMP41:%.*]] = and i1 [[TMP39]], [[TMP40]]
1714; CHECK-NEXT:    [[TMP42:%.*]] = sub i32 [[TMP38]], [[TMP8]]
1715; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP38]], [[TMP8]]
1716; CHECK-NEXT:    [[TMP44:%.*]] = select i1 [[TMP41]], i32 [[TMP42]], i32 [[TMP38]]
1717; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP40]], i32 [[TMP44]], i32 [[TMP43]]
1718; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP45]], [[TMP3]]
1719; CHECK-NEXT:    [[TMP47:%.*]] = sub i32 [[TMP46]], [[TMP3]]
1720; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i32> undef, i32 [[TMP47]], i64 0
1721; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i32> [[X]], i64 1
1722; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1723; CHECK-NEXT:    [[TMP51:%.*]] = ashr i32 [[TMP49]], 31
1724; CHECK-NEXT:    [[TMP52:%.*]] = ashr i32 [[TMP50]], 31
1725; CHECK-NEXT:    [[TMP53:%.*]] = add i32 [[TMP49]], [[TMP51]]
1726; CHECK-NEXT:    [[TMP54:%.*]] = add i32 [[TMP50]], [[TMP52]]
1727; CHECK-NEXT:    [[TMP55:%.*]] = xor i32 [[TMP53]], [[TMP51]]
1728; CHECK-NEXT:    [[TMP56:%.*]] = xor i32 [[TMP54]], [[TMP52]]
1729; CHECK-NEXT:    [[TMP57:%.*]] = uitofp i32 [[TMP56]] to float
1730; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
1731; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP58]], 0x41F0000000000000
1732; CHECK-NEXT:    [[TMP60:%.*]] = fptoui float [[TMP59]] to i32
1733; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP60]] to i64
1734; CHECK-NEXT:    [[TMP62:%.*]] = zext i32 [[TMP56]] to i64
1735; CHECK-NEXT:    [[TMP63:%.*]] = mul i64 [[TMP61]], [[TMP62]]
1736; CHECK-NEXT:    [[TMP64:%.*]] = trunc i64 [[TMP63]] to i32
1737; CHECK-NEXT:    [[TMP65:%.*]] = lshr i64 [[TMP63]], 32
1738; CHECK-NEXT:    [[TMP66:%.*]] = trunc i64 [[TMP65]] to i32
1739; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 0, [[TMP64]]
1740; CHECK-NEXT:    [[TMP68:%.*]] = icmp eq i32 [[TMP66]], 0
1741; CHECK-NEXT:    [[TMP69:%.*]] = select i1 [[TMP68]], i32 [[TMP67]], i32 [[TMP64]]
1742; CHECK-NEXT:    [[TMP70:%.*]] = zext i32 [[TMP69]] to i64
1743; CHECK-NEXT:    [[TMP71:%.*]] = zext i32 [[TMP60]] to i64
1744; CHECK-NEXT:    [[TMP72:%.*]] = mul i64 [[TMP70]], [[TMP71]]
1745; CHECK-NEXT:    [[TMP73:%.*]] = trunc i64 [[TMP72]] to i32
1746; CHECK-NEXT:    [[TMP74:%.*]] = lshr i64 [[TMP72]], 32
1747; CHECK-NEXT:    [[TMP75:%.*]] = trunc i64 [[TMP74]] to i32
1748; CHECK-NEXT:    [[TMP76:%.*]] = add i32 [[TMP60]], [[TMP75]]
1749; CHECK-NEXT:    [[TMP77:%.*]] = sub i32 [[TMP60]], [[TMP75]]
1750; CHECK-NEXT:    [[TMP78:%.*]] = select i1 [[TMP68]], i32 [[TMP76]], i32 [[TMP77]]
1751; CHECK-NEXT:    [[TMP79:%.*]] = zext i32 [[TMP78]] to i64
1752; CHECK-NEXT:    [[TMP80:%.*]] = zext i32 [[TMP55]] to i64
1753; CHECK-NEXT:    [[TMP81:%.*]] = mul i64 [[TMP79]], [[TMP80]]
1754; CHECK-NEXT:    [[TMP82:%.*]] = trunc i64 [[TMP81]] to i32
1755; CHECK-NEXT:    [[TMP83:%.*]] = lshr i64 [[TMP81]], 32
1756; CHECK-NEXT:    [[TMP84:%.*]] = trunc i64 [[TMP83]] to i32
1757; CHECK-NEXT:    [[TMP85:%.*]] = mul i32 [[TMP84]], [[TMP56]]
1758; CHECK-NEXT:    [[TMP86:%.*]] = sub i32 [[TMP55]], [[TMP85]]
1759; CHECK-NEXT:    [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP56]]
1760; CHECK-NEXT:    [[TMP88:%.*]] = icmp uge i32 [[TMP55]], [[TMP85]]
1761; CHECK-NEXT:    [[TMP89:%.*]] = and i1 [[TMP87]], [[TMP88]]
1762; CHECK-NEXT:    [[TMP90:%.*]] = sub i32 [[TMP86]], [[TMP56]]
1763; CHECK-NEXT:    [[TMP91:%.*]] = add i32 [[TMP86]], [[TMP56]]
1764; CHECK-NEXT:    [[TMP92:%.*]] = select i1 [[TMP89]], i32 [[TMP90]], i32 [[TMP86]]
1765; CHECK-NEXT:    [[TMP93:%.*]] = select i1 [[TMP88]], i32 [[TMP92]], i32 [[TMP91]]
1766; CHECK-NEXT:    [[TMP94:%.*]] = xor i32 [[TMP93]], [[TMP51]]
1767; CHECK-NEXT:    [[TMP95:%.*]] = sub i32 [[TMP94]], [[TMP51]]
1768; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i32> [[TMP48]], i32 [[TMP95]], i64 1
1769; CHECK-NEXT:    [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 2
1770; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1771; CHECK-NEXT:    [[TMP99:%.*]] = ashr i32 [[TMP97]], 31
1772; CHECK-NEXT:    [[TMP100:%.*]] = ashr i32 [[TMP98]], 31
1773; CHECK-NEXT:    [[TMP101:%.*]] = add i32 [[TMP97]], [[TMP99]]
1774; CHECK-NEXT:    [[TMP102:%.*]] = add i32 [[TMP98]], [[TMP100]]
1775; CHECK-NEXT:    [[TMP103:%.*]] = xor i32 [[TMP101]], [[TMP99]]
1776; CHECK-NEXT:    [[TMP104:%.*]] = xor i32 [[TMP102]], [[TMP100]]
1777; CHECK-NEXT:    [[TMP105:%.*]] = uitofp i32 [[TMP104]] to float
1778; CHECK-NEXT:    [[TMP106:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP105]])
1779; CHECK-NEXT:    [[TMP107:%.*]] = fmul fast float [[TMP106]], 0x41F0000000000000
1780; CHECK-NEXT:    [[TMP108:%.*]] = fptoui float [[TMP107]] to i32
1781; CHECK-NEXT:    [[TMP109:%.*]] = zext i32 [[TMP108]] to i64
1782; CHECK-NEXT:    [[TMP110:%.*]] = zext i32 [[TMP104]] to i64
1783; CHECK-NEXT:    [[TMP111:%.*]] = mul i64 [[TMP109]], [[TMP110]]
1784; CHECK-NEXT:    [[TMP112:%.*]] = trunc i64 [[TMP111]] to i32
1785; CHECK-NEXT:    [[TMP113:%.*]] = lshr i64 [[TMP111]], 32
1786; CHECK-NEXT:    [[TMP114:%.*]] = trunc i64 [[TMP113]] to i32
1787; CHECK-NEXT:    [[TMP115:%.*]] = sub i32 0, [[TMP112]]
1788; CHECK-NEXT:    [[TMP116:%.*]] = icmp eq i32 [[TMP114]], 0
1789; CHECK-NEXT:    [[TMP117:%.*]] = select i1 [[TMP116]], i32 [[TMP115]], i32 [[TMP112]]
1790; CHECK-NEXT:    [[TMP118:%.*]] = zext i32 [[TMP117]] to i64
1791; CHECK-NEXT:    [[TMP119:%.*]] = zext i32 [[TMP108]] to i64
1792; CHECK-NEXT:    [[TMP120:%.*]] = mul i64 [[TMP118]], [[TMP119]]
1793; CHECK-NEXT:    [[TMP121:%.*]] = trunc i64 [[TMP120]] to i32
1794; CHECK-NEXT:    [[TMP122:%.*]] = lshr i64 [[TMP120]], 32
1795; CHECK-NEXT:    [[TMP123:%.*]] = trunc i64 [[TMP122]] to i32
1796; CHECK-NEXT:    [[TMP124:%.*]] = add i32 [[TMP108]], [[TMP123]]
1797; CHECK-NEXT:    [[TMP125:%.*]] = sub i32 [[TMP108]], [[TMP123]]
1798; CHECK-NEXT:    [[TMP126:%.*]] = select i1 [[TMP116]], i32 [[TMP124]], i32 [[TMP125]]
1799; CHECK-NEXT:    [[TMP127:%.*]] = zext i32 [[TMP126]] to i64
1800; CHECK-NEXT:    [[TMP128:%.*]] = zext i32 [[TMP103]] to i64
1801; CHECK-NEXT:    [[TMP129:%.*]] = mul i64 [[TMP127]], [[TMP128]]
1802; CHECK-NEXT:    [[TMP130:%.*]] = trunc i64 [[TMP129]] to i32
1803; CHECK-NEXT:    [[TMP131:%.*]] = lshr i64 [[TMP129]], 32
1804; CHECK-NEXT:    [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32
1805; CHECK-NEXT:    [[TMP133:%.*]] = mul i32 [[TMP132]], [[TMP104]]
1806; CHECK-NEXT:    [[TMP134:%.*]] = sub i32 [[TMP103]], [[TMP133]]
1807; CHECK-NEXT:    [[TMP135:%.*]] = icmp uge i32 [[TMP134]], [[TMP104]]
1808; CHECK-NEXT:    [[TMP136:%.*]] = icmp uge i32 [[TMP103]], [[TMP133]]
1809; CHECK-NEXT:    [[TMP137:%.*]] = and i1 [[TMP135]], [[TMP136]]
1810; CHECK-NEXT:    [[TMP138:%.*]] = sub i32 [[TMP134]], [[TMP104]]
1811; CHECK-NEXT:    [[TMP139:%.*]] = add i32 [[TMP134]], [[TMP104]]
1812; CHECK-NEXT:    [[TMP140:%.*]] = select i1 [[TMP137]], i32 [[TMP138]], i32 [[TMP134]]
1813; CHECK-NEXT:    [[TMP141:%.*]] = select i1 [[TMP136]], i32 [[TMP140]], i32 [[TMP139]]
1814; CHECK-NEXT:    [[TMP142:%.*]] = xor i32 [[TMP141]], [[TMP99]]
1815; CHECK-NEXT:    [[TMP143:%.*]] = sub i32 [[TMP142]], [[TMP99]]
1816; CHECK-NEXT:    [[TMP144:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP143]], i64 2
1817; CHECK-NEXT:    [[TMP145:%.*]] = extractelement <4 x i32> [[X]], i64 3
1818; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1819; CHECK-NEXT:    [[TMP147:%.*]] = ashr i32 [[TMP145]], 31
1820; CHECK-NEXT:    [[TMP148:%.*]] = ashr i32 [[TMP146]], 31
1821; CHECK-NEXT:    [[TMP149:%.*]] = add i32 [[TMP145]], [[TMP147]]
1822; CHECK-NEXT:    [[TMP150:%.*]] = add i32 [[TMP146]], [[TMP148]]
1823; CHECK-NEXT:    [[TMP151:%.*]] = xor i32 [[TMP149]], [[TMP147]]
1824; CHECK-NEXT:    [[TMP152:%.*]] = xor i32 [[TMP150]], [[TMP148]]
1825; CHECK-NEXT:    [[TMP153:%.*]] = uitofp i32 [[TMP152]] to float
1826; CHECK-NEXT:    [[TMP154:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP153]])
1827; CHECK-NEXT:    [[TMP155:%.*]] = fmul fast float [[TMP154]], 0x41F0000000000000
1828; CHECK-NEXT:    [[TMP156:%.*]] = fptoui float [[TMP155]] to i32
1829; CHECK-NEXT:    [[TMP157:%.*]] = zext i32 [[TMP156]] to i64
1830; CHECK-NEXT:    [[TMP158:%.*]] = zext i32 [[TMP152]] to i64
1831; CHECK-NEXT:    [[TMP159:%.*]] = mul i64 [[TMP157]], [[TMP158]]
1832; CHECK-NEXT:    [[TMP160:%.*]] = trunc i64 [[TMP159]] to i32
1833; CHECK-NEXT:    [[TMP161:%.*]] = lshr i64 [[TMP159]], 32
1834; CHECK-NEXT:    [[TMP162:%.*]] = trunc i64 [[TMP161]] to i32
1835; CHECK-NEXT:    [[TMP163:%.*]] = sub i32 0, [[TMP160]]
1836; CHECK-NEXT:    [[TMP164:%.*]] = icmp eq i32 [[TMP162]], 0
1837; CHECK-NEXT:    [[TMP165:%.*]] = select i1 [[TMP164]], i32 [[TMP163]], i32 [[TMP160]]
1838; CHECK-NEXT:    [[TMP166:%.*]] = zext i32 [[TMP165]] to i64
1839; CHECK-NEXT:    [[TMP167:%.*]] = zext i32 [[TMP156]] to i64
1840; CHECK-NEXT:    [[TMP168:%.*]] = mul i64 [[TMP166]], [[TMP167]]
1841; CHECK-NEXT:    [[TMP169:%.*]] = trunc i64 [[TMP168]] to i32
1842; CHECK-NEXT:    [[TMP170:%.*]] = lshr i64 [[TMP168]], 32
1843; CHECK-NEXT:    [[TMP171:%.*]] = trunc i64 [[TMP170]] to i32
1844; CHECK-NEXT:    [[TMP172:%.*]] = add i32 [[TMP156]], [[TMP171]]
1845; CHECK-NEXT:    [[TMP173:%.*]] = sub i32 [[TMP156]], [[TMP171]]
1846; CHECK-NEXT:    [[TMP174:%.*]] = select i1 [[TMP164]], i32 [[TMP172]], i32 [[TMP173]]
1847; CHECK-NEXT:    [[TMP175:%.*]] = zext i32 [[TMP174]] to i64
1848; CHECK-NEXT:    [[TMP176:%.*]] = zext i32 [[TMP151]] to i64
1849; CHECK-NEXT:    [[TMP177:%.*]] = mul i64 [[TMP175]], [[TMP176]]
1850; CHECK-NEXT:    [[TMP178:%.*]] = trunc i64 [[TMP177]] to i32
1851; CHECK-NEXT:    [[TMP179:%.*]] = lshr i64 [[TMP177]], 32
1852; CHECK-NEXT:    [[TMP180:%.*]] = trunc i64 [[TMP179]] to i32
1853; CHECK-NEXT:    [[TMP181:%.*]] = mul i32 [[TMP180]], [[TMP152]]
1854; CHECK-NEXT:    [[TMP182:%.*]] = sub i32 [[TMP151]], [[TMP181]]
1855; CHECK-NEXT:    [[TMP183:%.*]] = icmp uge i32 [[TMP182]], [[TMP152]]
1856; CHECK-NEXT:    [[TMP184:%.*]] = icmp uge i32 [[TMP151]], [[TMP181]]
1857; CHECK-NEXT:    [[TMP185:%.*]] = and i1 [[TMP183]], [[TMP184]]
1858; CHECK-NEXT:    [[TMP186:%.*]] = sub i32 [[TMP182]], [[TMP152]]
1859; CHECK-NEXT:    [[TMP187:%.*]] = add i32 [[TMP182]], [[TMP152]]
1860; CHECK-NEXT:    [[TMP188:%.*]] = select i1 [[TMP185]], i32 [[TMP186]], i32 [[TMP182]]
1861; CHECK-NEXT:    [[TMP189:%.*]] = select i1 [[TMP184]], i32 [[TMP188]], i32 [[TMP187]]
1862; CHECK-NEXT:    [[TMP190:%.*]] = xor i32 [[TMP189]], [[TMP147]]
1863; CHECK-NEXT:    [[TMP191:%.*]] = sub i32 [[TMP190]], [[TMP147]]
1864; CHECK-NEXT:    [[TMP192:%.*]] = insertelement <4 x i32> [[TMP144]], i32 [[TMP191]], i64 3
1865; CHECK-NEXT:    store <4 x i32> [[TMP192]], <4 x i32> addrspace(1)* [[OUT:%.*]]
1866; CHECK-NEXT:    ret void
1867;
1868; GCN-LABEL: srem_v4i32:
1869; GCN:       ; %bb.0:
1870; GCN-NEXT:    s_load_dwordx8 s[12:19], s[0:1], 0xd
1871; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
1872; GCN-NEXT:    s_mov_b32 s11, 0xf000
1873; GCN-NEXT:    s_mov_b32 s10, -1
1874; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1875; GCN-NEXT:    s_ashr_i32 s2, s16, 31
1876; GCN-NEXT:    s_add_i32 s3, s16, s2
1877; GCN-NEXT:    s_xor_b32 s5, s3, s2
1878; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s5
1879; GCN-NEXT:    s_mov_b32 s16, 0x4f800000
1880; GCN-NEXT:    s_ashr_i32 s6, s12, 31
1881; GCN-NEXT:    s_ashr_i32 s2, s17, 31
1882; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1883; GCN-NEXT:    s_add_i32 s0, s12, s6
1884; GCN-NEXT:    s_add_i32 s3, s17, s2
1885; GCN-NEXT:    s_xor_b32 s4, s0, s6
1886; GCN-NEXT:    v_mul_f32_e32 v0, s16, v0
1887; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
1888; GCN-NEXT:    s_xor_b32 s17, s3, s2
1889; GCN-NEXT:    s_ashr_i32 s7, s13, 31
1890; GCN-NEXT:    s_add_i32 s12, s13, s7
1891; GCN-NEXT:    v_mul_lo_u32 v1, v0, s5
1892; GCN-NEXT:    v_mul_hi_u32 v2, v0, s5
1893; GCN-NEXT:    s_xor_b32 s12, s12, s7
1894; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
1895; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
1896; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
1897; GCN-NEXT:    v_mul_hi_u32 v1, v1, v0
1898; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s17
1899; GCN-NEXT:    v_add_i32_e32 v3, vcc, v1, v0
1900; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
1901; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v2
1902; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
1903; GCN-NEXT:    v_mul_hi_u32 v0, v0, s4
1904; GCN-NEXT:    v_mul_f32_e32 v1, s16, v1
1905; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1906; GCN-NEXT:    v_mul_lo_u32 v0, v0, s5
1907; GCN-NEXT:    v_mul_lo_u32 v4, v1, s17
1908; GCN-NEXT:    v_mul_hi_u32 v5, v1, s17
1909; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s4, v0
1910; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s4, v0
1911; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s5, v2
1912; GCN-NEXT:    v_add_i32_e32 v3, vcc, s5, v2
1913; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v2
1914; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v4
1915; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
1916; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[4:5]
1917; GCN-NEXT:    v_mul_hi_u32 v4, v4, v1
1918; GCN-NEXT:    v_add_i32_e32 v5, vcc, v4, v1
1919; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v4, v1
1920; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
1921; GCN-NEXT:    s_ashr_i32 s0, s18, 31
1922; GCN-NEXT:    s_add_i32 s1, s18, s0
1923; GCN-NEXT:    s_xor_b32 s13, s1, s0
1924; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
1925; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s13
1926; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
1927; GCN-NEXT:    v_mul_hi_u32 v1, v1, s12
1928; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[2:3]
1929; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1930; GCN-NEXT:    v_xor_b32_e32 v0, s6, v0
1931; GCN-NEXT:    v_mul_lo_u32 v1, v1, s17
1932; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s6, v0
1933; GCN-NEXT:    v_mul_f32_e32 v2, s16, v2
1934; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1935; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s12, v1
1936; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s12, v1
1937; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s17, v3
1938; GCN-NEXT:    v_mul_lo_u32 v5, v2, s13
1939; GCN-NEXT:    v_mul_hi_u32 v6, v2, s13
1940; GCN-NEXT:    v_add_i32_e32 v4, vcc, s17, v3
1941; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s17, v3
1942; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 0, v5
1943; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v6
1944; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[4:5]
1945; GCN-NEXT:    v_mul_hi_u32 v5, v5, v2
1946; GCN-NEXT:    s_ashr_i32 s6, s14, 31
1947; GCN-NEXT:    s_add_i32 s12, s14, s6
1948; GCN-NEXT:    s_xor_b32 s12, s12, s6
1949; GCN-NEXT:    v_add_i32_e32 v6, vcc, v5, v2
1950; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v5, v2
1951; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
1952; GCN-NEXT:    s_ashr_i32 s0, s19, 31
1953; GCN-NEXT:    s_add_i32 s1, s19, s0
1954; GCN-NEXT:    s_xor_b32 s14, s1, s0
1955; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
1956; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s14
1957; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
1958; GCN-NEXT:    v_mul_hi_u32 v2, v2, s12
1959; GCN-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[2:3]
1960; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1961; GCN-NEXT:    v_xor_b32_e32 v1, s7, v1
1962; GCN-NEXT:    v_mul_lo_u32 v2, v2, s13
1963; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s7, v1
1964; GCN-NEXT:    v_mul_f32_e32 v3, s16, v3
1965; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1966; GCN-NEXT:    s_ashr_i32 s7, s15, 31
1967; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s12, v2
1968; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s12, v2
1969; GCN-NEXT:    v_mul_lo_u32 v6, v3, s14
1970; GCN-NEXT:    v_mul_hi_u32 v7, v3, s14
1971; GCN-NEXT:    s_add_i32 s12, s15, s7
1972; GCN-NEXT:    s_xor_b32 s12, s12, s7
1973; GCN-NEXT:    v_sub_i32_e32 v8, vcc, 0, v6
1974; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v7
1975; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s[4:5]
1976; GCN-NEXT:    v_mul_hi_u32 v6, v6, v3
1977; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v4
1978; GCN-NEXT:    v_add_i32_e32 v5, vcc, s13, v4
1979; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s13, v4
1980; GCN-NEXT:    v_add_i32_e32 v7, vcc, v6, v3
1981; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, v6, v3
1982; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[4:5]
1983; GCN-NEXT:    v_mul_hi_u32 v3, v3, s12
1984; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
1985; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
1986; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
1987; GCN-NEXT:    v_mul_lo_u32 v3, v3, s14
1988; GCN-NEXT:    v_xor_b32_e32 v2, s6, v2
1989; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v2
1990; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s12, v3
1991; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s12, v3
1992; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v4
1993; GCN-NEXT:    v_add_i32_e32 v5, vcc, s14, v4
1994; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s14, v4
1995; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
1996; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
1997; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[2:3]
1998; GCN-NEXT:    v_xor_b32_e32 v3, s7, v3
1999; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s7, v3
2000; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
2001; GCN-NEXT:    s_endpgm
2002  %r = srem <4 x i32> %x, %y
2003  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
2004  ret void
2005}
2006
; Test: unsigned division of two <4 x i16> kernel arguments, result stored to %out.
;
; The assertions below are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts rather than editing by hand.
;
; opt assertions (CHECK prefix, -amdgpu-codegenprepare): the vector udiv is
; expected to be scalarized. For each of the four lanes the expected IR:
;   * extracts the x and y elements and zero-extends them to i32,
;   * converts both to float and multiplies x by llvm.amdgcn.rcp.f32(y),
;   * truncates that product (llvm.trunc.f32) to get a quotient estimate,
;   * forms fmad.ftz(-q, y, x) and adds 1 to the integer quotient when the
;     absolute value of that term is >= the absolute value of y,
;   * masks the result with 65535, truncates to i16 and inserts it into the
;     result vector.
;
; llc assertions (GCN prefix, -mcpu=tahiti): the expected machine code uses
; v_cvt_f32_u32 / v_rcp_iflag_f32 / v_mul_f32 / v_trunc_f32 / v_mad_f32 per
; lane, applies the +1 correction via v_cmp_ge_f32 + v_addc_u32, packs the
; 16-bit results with v_lshlrev_b32 / v_and_b32 / v_or_b32, and writes them
; with a single buffer_store_dwordx2.
2007define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2008; CHECK-LABEL: @udiv_v4i16(
2009; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2010; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2011; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2012; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2013; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2014; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2015; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2016; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2017; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2018; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2019; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2020; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2021; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2022; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2023; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2024; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2025; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2026; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
2027; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
2028; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0
2029; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1
2030; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2031; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
2032; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
2033; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
2034; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
2035; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
2036; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
2037; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
2038; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
2039; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
2040; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
2041; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
2042; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
2043; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
2044; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
2045; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
2046; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
2047; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
2048; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1
2049; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2
2050; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2051; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
2052; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
2053; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
2054; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
2055; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
2056; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
2057; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
2058; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
2059; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
2060; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
2061; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
2062; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
2063; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
2064; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
2065; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
2066; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
2067; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
2068; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2
2069; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3
2070; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2071; CHECK-NEXT:    [[TMP63:%.*]] = zext i16 [[TMP61]] to i32
2072; CHECK-NEXT:    [[TMP64:%.*]] = zext i16 [[TMP62]] to i32
2073; CHECK-NEXT:    [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float
2074; CHECK-NEXT:    [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float
2075; CHECK-NEXT:    [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]])
2076; CHECK-NEXT:    [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]]
2077; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]])
2078; CHECK-NEXT:    [[TMP70:%.*]] = fneg fast float [[TMP69]]
2079; CHECK-NEXT:    [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]])
2080; CHECK-NEXT:    [[TMP72:%.*]] = fptoui float [[TMP69]] to i32
2081; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]])
2082; CHECK-NEXT:    [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
2083; CHECK-NEXT:    [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]]
2084; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0
2085; CHECK-NEXT:    [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]]
2086; CHECK-NEXT:    [[TMP78:%.*]] = and i32 [[TMP77]], 65535
2087; CHECK-NEXT:    [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16
2088; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3
2089; CHECK-NEXT:    store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]]
2090; CHECK-NEXT:    ret void
2091;
2092; GCN-LABEL: udiv_v4i16:
2093; GCN:       ; %bb.0:
2094; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2095; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2096; GCN-NEXT:    s_mov_b32 s8, 0xffff
2097; GCN-NEXT:    s_mov_b32 s7, 0xf000
2098; GCN-NEXT:    s_mov_b32 s6, -1
2099; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2100; GCN-NEXT:    s_and_b32 s9, s2, s8
2101; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
2102; GCN-NEXT:    s_lshr_b32 s9, s0, 16
2103; GCN-NEXT:    s_and_b32 s0, s0, s8
2104; GCN-NEXT:    s_lshr_b32 s2, s2, 16
2105; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s2
2106; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s0
2107; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2108; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s9
2109; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
2110; GCN-NEXT:    s_and_b32 s2, s3, s8
2111; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2112; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2113; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2114; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
2115; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2116; GCN-NEXT:    v_mul_f32_e32 v1, v4, v5
2117; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2118; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
2119; GCN-NEXT:    v_mad_f32 v2, -v1, v3, v4
2120; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s2
2121; GCN-NEXT:    s_lshr_b32 s0, s1, 16
2122; GCN-NEXT:    s_and_b32 s1, s1, s8
2123; GCN-NEXT:    s_lshr_b32 s10, s3, 16
2124; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
2125; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
2126; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s10
2127; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s1
2128; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2129; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
2130; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v3
2131; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2132; GCN-NEXT:    v_mul_f32_e32 v1, v5, v6
2133; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s0
2134; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2135; GCN-NEXT:    v_mad_f32 v5, -v1, v4, v5
2136; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
2137; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
2138; GCN-NEXT:    v_mul_f32_e32 v4, v6, v7
2139; GCN-NEXT:    v_trunc_f32_e32 v4, v4
2140; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
2141; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2142; GCN-NEXT:    v_mad_f32 v4, -v4, v3, v6
2143; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
2144; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
2145; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
2146; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2147; GCN-NEXT:    v_and_b32_e32 v1, s8, v1
2148; GCN-NEXT:    v_or_b32_e32 v1, v1, v3
2149; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
2150; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2151; GCN-NEXT:    s_endpgm
2152  %r = udiv <4 x i16> %x, %y
2153  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2154  ret void
2155}
2156
2157define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2158; CHECK-LABEL: @urem_v4i16(
2159; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2160; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2161; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2162; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2163; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2164; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2165; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2166; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2167; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2168; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2169; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2170; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2171; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2172; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2173; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2174; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2175; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2176; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
2177; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
2178; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
2179; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
2180; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0
2181; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1
2182; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2183; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
2184; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
2185; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
2186; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
2187; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
2188; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
2189; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
2190; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
2191; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
2192; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
2193; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2194; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
2195; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
2196; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
2197; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
2198; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
2199; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
2200; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
2201; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
2202; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1
2203; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2
2204; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2205; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
2206; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
2207; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
2208; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
2209; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
2210; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
2211; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
2212; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
2213; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
2214; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
2215; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
2216; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
2217; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
2218; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
2219; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
2220; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
2221; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
2222; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
2223; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
2224; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2
2225; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3
2226; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2227; CHECK-NEXT:    [[TMP69:%.*]] = zext i16 [[TMP67]] to i32
2228; CHECK-NEXT:    [[TMP70:%.*]] = zext i16 [[TMP68]] to i32
2229; CHECK-NEXT:    [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float
2230; CHECK-NEXT:    [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float
2231; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]])
2232; CHECK-NEXT:    [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]]
2233; CHECK-NEXT:    [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]])
2234; CHECK-NEXT:    [[TMP76:%.*]] = fneg fast float [[TMP75]]
2235; CHECK-NEXT:    [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]])
2236; CHECK-NEXT:    [[TMP78:%.*]] = fptoui float [[TMP75]] to i32
2237; CHECK-NEXT:    [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]])
2238; CHECK-NEXT:    [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]])
2239; CHECK-NEXT:    [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]]
2240; CHECK-NEXT:    [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0
2241; CHECK-NEXT:    [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]]
2242; CHECK-NEXT:    [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]]
2243; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]]
2244; CHECK-NEXT:    [[TMP86:%.*]] = and i32 [[TMP85]], 65535
2245; CHECK-NEXT:    [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16
2246; CHECK-NEXT:    [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3
2247; CHECK-NEXT:    store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]]
2248; CHECK-NEXT:    ret void
2249;
2250; GCN-LABEL: urem_v4i16:
2251; GCN:       ; %bb.0:
2252; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2253; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2254; GCN-NEXT:    s_mov_b32 s8, 0xffff
2255; GCN-NEXT:    s_mov_b32 s7, 0xf000
2256; GCN-NEXT:    s_mov_b32 s6, -1
2257; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2258; GCN-NEXT:    s_and_b32 s9, s2, s8
2259; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
2260; GCN-NEXT:    s_and_b32 s10, s0, s8
2261; GCN-NEXT:    s_lshr_b32 s11, s2, 16
2262; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s10
2263; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2264; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s11
2265; GCN-NEXT:    s_lshr_b32 s9, s0, 16
2266; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s9
2267; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2268; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
2269; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2270; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2271; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
2272; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2273; GCN-NEXT:    v_mul_f32_e32 v1, v4, v5
2274; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2275; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
2276; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v1
2277; GCN-NEXT:    v_mad_f32 v1, -v1, v3, v4
2278; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v3
2279; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
2280; GCN-NEXT:    s_and_b32 s2, s3, s8
2281; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
2282; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s2
2283; GCN-NEXT:    s_and_b32 s2, s1, s8
2284; GCN-NEXT:    v_mul_lo_u32 v1, v1, s11
2285; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s2
2286; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2287; GCN-NEXT:    s_lshr_b32 s12, s3, 16
2288; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s9, v1
2289; GCN-NEXT:    s_lshr_b32 s10, s1, 16
2290; GCN-NEXT:    v_mul_f32_e32 v1, v3, v4
2291; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s12
2292; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s10
2293; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2294; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2295; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v4
2296; GCN-NEXT:    v_mad_f32 v3, -v1, v2, v3
2297; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
2298; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
2299; GCN-NEXT:    v_mul_f32_e32 v2, v6, v7
2300; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2301; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
2302; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2303; GCN-NEXT:    v_mad_f32 v2, -v2, v4, v6
2304; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
2305; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
2306; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
2307; GCN-NEXT:    v_mul_lo_u32 v2, v2, s12
2308; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
2309; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
2310; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
2311; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2312; GCN-NEXT:    v_and_b32_e32 v1, s8, v1
2313; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
2314; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
2315; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
2316; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2317; GCN-NEXT:    s_endpgm
2318  %r = urem <4 x i16> %x, %y
2319  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2320  ret void
2321}
2322
; Test: signed division of two <4 x i16> kernel arguments, result stored to %out.
;
; The IR assertions below (default FileCheck prefix, produced by the opt
; -amdgpu-codegenprepare RUN line at the top of the file) expect the vector sdiv
; to be expanded one lane at a time (lanes 0..3). For each lane the expected
; sequence is:
;   * extractelement from X and Y, sext i16 -> i32
;   * xor / ashr 30 / or 1 on the two operands (evaluates to +1 or -1 according
;     to whether the operand signs differ); used as the correction term
;   * sitofp both operands, llvm.amdgcn.rcp.f32 of the divisor, fmul, llvm.trunc
;   * fneg + llvm.amdgcn.fmad.ftz.f32 to form the float remainder, then
;     fabs/fabs/fcmp oge to decide whether the correction term is added
;   * fptosi of the truncated quotient, add of the selected correction
;   * shl 16 / ashr 16, trunc to i16, insertelement into the result vector
;
; The ISA assertions (GCN prefix, produced by the llc RUN line, -mcpu=tahiti)
; pin the exact instruction order and register assignment of the final code.
2323define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2324; CHECK-LABEL: @sdiv_v4i16(
2325; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2326; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2327; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2328; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2329; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2330; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2331; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2332; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2333; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2334; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2335; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2336; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2337; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2338; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2339; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2340; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2341; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2342; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2343; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2344; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2345; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
2346; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
2347; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
2348; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0
2349; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1
2350; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2351; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
2352; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
2353; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
2354; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
2355; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
2356; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
2357; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
2358; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
2359; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
2360; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
2361; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
2362; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
2363; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
2364; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
2365; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2366; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
2367; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
2368; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
2369; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
2370; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
2371; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
2372; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1
2373; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2
2374; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2375; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
2376; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
2377; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
2378; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
2379; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
2380; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
2381; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
2382; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
2383; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
2384; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
2385; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
2386; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
2387; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
2388; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
2389; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
2390; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
2391; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
2392; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
2393; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
2394; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
2395; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
2396; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2
2397; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3
2398; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2399; CHECK-NEXT:    [[TMP75:%.*]] = sext i16 [[TMP73]] to i32
2400; CHECK-NEXT:    [[TMP76:%.*]] = sext i16 [[TMP74]] to i32
2401; CHECK-NEXT:    [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]]
2402; CHECK-NEXT:    [[TMP78:%.*]] = ashr i32 [[TMP77]], 30
2403; CHECK-NEXT:    [[TMP79:%.*]] = or i32 [[TMP78]], 1
2404; CHECK-NEXT:    [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float
2405; CHECK-NEXT:    [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float
2406; CHECK-NEXT:    [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]])
2407; CHECK-NEXT:    [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]]
2408; CHECK-NEXT:    [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]])
2409; CHECK-NEXT:    [[TMP85:%.*]] = fneg fast float [[TMP84]]
2410; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]])
2411; CHECK-NEXT:    [[TMP87:%.*]] = fptosi float [[TMP84]] to i32
2412; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]])
2413; CHECK-NEXT:    [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]])
2414; CHECK-NEXT:    [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]]
2415; CHECK-NEXT:    [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0
2416; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]]
2417; CHECK-NEXT:    [[TMP93:%.*]] = shl i32 [[TMP92]], 16
2418; CHECK-NEXT:    [[TMP94:%.*]] = ashr i32 [[TMP93]], 16
2419; CHECK-NEXT:    [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16
2420; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3
2421; CHECK-NEXT:    store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]]
2422; CHECK-NEXT:    ret void
2423;
2424; GCN-LABEL: sdiv_v4i16:
2425; GCN:       ; %bb.0:
2426; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2427; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2428; GCN-NEXT:    s_mov_b32 s7, 0xf000
2429; GCN-NEXT:    s_mov_b32 s6, -1
2430; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2431; GCN-NEXT:    s_sext_i32_i16 s8, s2
2432; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
2433; GCN-NEXT:    s_sext_i32_i16 s9, s0
2434; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
2435; GCN-NEXT:    s_xor_b32 s8, s9, s8
2436; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2437; GCN-NEXT:    s_ashr_i32 s2, s2, 16
2438; GCN-NEXT:    s_ashr_i32 s8, s8, 30
2439; GCN-NEXT:    s_or_b32 s8, s8, 1
2440; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2441; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2442; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2443; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
2444; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2445; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
2446; GCN-NEXT:    v_mov_b32_e32 v3, s8
2447; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
2448; GCN-NEXT:    s_ashr_i32 s0, s0, 16
2449; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2450; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2451; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
2452; GCN-NEXT:    s_xor_b32 s0, s0, s2
2453; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2454; GCN-NEXT:    s_or_b32 s0, s0, 1
2455; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
2456; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2457; GCN-NEXT:    v_mad_f32 v2, -v3, v1, v2
2458; GCN-NEXT:    v_mov_b32_e32 v4, s0
2459; GCN-NEXT:    s_sext_i32_i16 s0, s3
2460; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
2461; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
2462; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2463; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
2464; GCN-NEXT:    s_sext_i32_i16 s2, s1
2465; GCN-NEXT:    v_add_i32_e32 v3, vcc, v1, v3
2466; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
2467; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2468; GCN-NEXT:    s_xor_b32 s0, s2, s0
2469; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2470; GCN-NEXT:    s_or_b32 s0, s0, 1
2471; GCN-NEXT:    v_mul_f32_e32 v4, v1, v4
2472; GCN-NEXT:    v_trunc_f32_e32 v4, v4
2473; GCN-NEXT:    v_mad_f32 v1, -v4, v2, v1
2474; GCN-NEXT:    v_mov_b32_e32 v5, s0
2475; GCN-NEXT:    s_ashr_i32 s0, s3, 16
2476; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
2477; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
2478; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2479; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
2480; GCN-NEXT:    s_ashr_i32 s1, s1, 16
2481; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
2482; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s1
2483; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v2
2484; GCN-NEXT:    s_xor_b32 s0, s1, s0
2485; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2486; GCN-NEXT:    s_or_b32 s0, s0, 1
2487; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
2488; GCN-NEXT:    v_trunc_f32_e32 v5, v5
2489; GCN-NEXT:    v_mad_f32 v4, -v5, v2, v4
2490; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
2491; GCN-NEXT:    v_mov_b32_e32 v6, s0
2492; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
2493; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
2494; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
2495; GCN-NEXT:    s_mov_b32 s0, 0xffff
2496; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2497; GCN-NEXT:    v_and_b32_e32 v1, s0, v1
2498; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
2499; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
2500; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
2501; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
2502; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2503; GCN-NEXT:    s_endpgm
; Input IR under test: one vector sdiv whose result is stored to %out.
2504  %r = sdiv <4 x i16> %x, %y
2505  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2506  ret void
2507}
2508
; Test: signed remainder of two <4 x i16> kernel arguments, result stored to %out.
;
; The IR assertions below (default FileCheck prefix, produced by the opt
; -amdgpu-codegenprepare RUN line at the top of the file) expect the vector srem
; to be expanded one lane at a time (lanes 0..3). Each lane first computes a
; quotient the same way the sdiv expansion does:
;   * extractelement from X and Y, sext i16 -> i32
;   * xor / ashr 30 / or 1 on the two operands (evaluates to +1 or -1 according
;     to whether the operand signs differ); used as the correction term
;   * sitofp, llvm.amdgcn.rcp.f32 of the divisor, fmul, llvm.trunc, fneg +
;     llvm.amdgcn.fmad.ftz.f32, fabs/fabs/fcmp oge, select, fptosi, add
; and then derives the remainder from it:
;   * mul quotient by the sign-extended divisor, sub from the sign-extended
;     dividend
;   * shl 16 / ashr 16, trunc to i16, insertelement into the result vector
;
; The ISA assertions (GCN prefix, produced by the llc RUN line, -mcpu=tahiti)
; pin the exact instruction order and register assignment of the final code.
2509define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2510; CHECK-LABEL: @srem_v4i16(
2511; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2512; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2513; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2514; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2515; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2516; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2517; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2518; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2519; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2520; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2521; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2522; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2523; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2524; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2525; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2526; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2527; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2528; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2529; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2530; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2531; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
2532; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
2533; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
2534; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
2535; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
2536; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0
2537; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1
2538; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2539; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
2540; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
2541; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
2542; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
2543; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
2544; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
2545; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
2546; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
2547; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
2548; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
2549; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
2550; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
2551; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
2552; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
2553; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
2554; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
2555; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
2556; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
2557; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
2558; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
2559; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
2560; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
2561; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
2562; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1
2563; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2
2564; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2565; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
2566; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
2567; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
2568; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
2569; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
2570; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
2571; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
2572; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
2573; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
2574; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
2575; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
2576; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
2577; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
2578; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
2579; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
2580; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
2581; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
2582; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
2583; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
2584; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
2585; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
2586; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
2587; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
2588; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2
2589; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3
2590; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2591; CHECK-NEXT:    [[TMP81:%.*]] = sext i16 [[TMP79]] to i32
2592; CHECK-NEXT:    [[TMP82:%.*]] = sext i16 [[TMP80]] to i32
2593; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]]
2594; CHECK-NEXT:    [[TMP84:%.*]] = ashr i32 [[TMP83]], 30
2595; CHECK-NEXT:    [[TMP85:%.*]] = or i32 [[TMP84]], 1
2596; CHECK-NEXT:    [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float
2597; CHECK-NEXT:    [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float
2598; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]])
2599; CHECK-NEXT:    [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]]
2600; CHECK-NEXT:    [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]])
2601; CHECK-NEXT:    [[TMP91:%.*]] = fneg fast float [[TMP90]]
2602; CHECK-NEXT:    [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]])
2603; CHECK-NEXT:    [[TMP93:%.*]] = fptosi float [[TMP90]] to i32
2604; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]])
2605; CHECK-NEXT:    [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]])
2606; CHECK-NEXT:    [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]]
2607; CHECK-NEXT:    [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0
2608; CHECK-NEXT:    [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]]
2609; CHECK-NEXT:    [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]]
2610; CHECK-NEXT:    [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]]
2611; CHECK-NEXT:    [[TMP101:%.*]] = shl i32 [[TMP100]], 16
2612; CHECK-NEXT:    [[TMP102:%.*]] = ashr i32 [[TMP101]], 16
2613; CHECK-NEXT:    [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16
2614; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3
2615; CHECK-NEXT:    store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]]
2616; CHECK-NEXT:    ret void
2617;
2618; GCN-LABEL: srem_v4i16:
2619; GCN:       ; %bb.0:
2620; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2621; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2622; GCN-NEXT:    s_mov_b32 s7, 0xf000
2623; GCN-NEXT:    s_mov_b32 s6, -1
2624; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2625; GCN-NEXT:    s_sext_i32_i16 s8, s2
2626; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
2627; GCN-NEXT:    s_sext_i32_i16 s9, s0
2628; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
2629; GCN-NEXT:    s_xor_b32 s8, s9, s8
2630; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2631; GCN-NEXT:    s_ashr_i32 s8, s8, 30
2632; GCN-NEXT:    s_or_b32 s8, s8, 1
2633; GCN-NEXT:    v_mov_b32_e32 v3, s8
2634; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2635; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2636; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2637; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2638; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
2639; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
2640; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2641; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
2642; GCN-NEXT:    s_ashr_i32 s2, s2, 16
2643; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
2644; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2645; GCN-NEXT:    s_ashr_i32 s0, s0, 16
2646; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2647; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
2648; GCN-NEXT:    s_xor_b32 s8, s0, s2
2649; GCN-NEXT:    s_ashr_i32 s8, s8, 30
2650; GCN-NEXT:    s_or_b32 s8, s8, 1
2651; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
2652; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2653; GCN-NEXT:    v_mad_f32 v2, -v3, v1, v2
2654; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
2655; GCN-NEXT:    v_mov_b32_e32 v4, s8
2656; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
2657; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
2658; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
2659; GCN-NEXT:    v_mul_lo_u32 v1, v1, s2
2660; GCN-NEXT:    s_sext_i32_i16 s2, s3
2661; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s2
2662; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s0, v1
2663; GCN-NEXT:    s_sext_i32_i16 s0, s1
2664; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
2665; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2666; GCN-NEXT:    s_xor_b32 s0, s0, s2
2667; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2668; GCN-NEXT:    s_or_b32 s0, s0, 1
2669; GCN-NEXT:    v_mul_f32_e32 v4, v1, v4
2670; GCN-NEXT:    v_trunc_f32_e32 v4, v4
2671; GCN-NEXT:    v_mad_f32 v1, -v4, v2, v1
2672; GCN-NEXT:    v_mov_b32_e32 v5, s0
2673; GCN-NEXT:    s_ashr_i32 s0, s3, 16
2674; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
2675; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
2676; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2677; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
2678; GCN-NEXT:    s_ashr_i32 s2, s1, 16
2679; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
2680; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s2
2681; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v2
2682; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
2683; GCN-NEXT:    s_xor_b32 s3, s2, s0
2684; GCN-NEXT:    s_ashr_i32 s3, s3, 30
2685; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
2686; GCN-NEXT:    v_trunc_f32_e32 v5, v5
2687; GCN-NEXT:    v_mad_f32 v4, -v5, v2, v4
2688; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
2689; GCN-NEXT:    s_or_b32 s3, s3, 1
2690; GCN-NEXT:    v_mov_b32_e32 v6, s3
2691; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
2692; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
2693; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
2694; GCN-NEXT:    v_mul_lo_u32 v2, v2, s0
2695; GCN-NEXT:    s_mov_b32 s0, 0xffff
2696; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
2697; GCN-NEXT:    v_and_b32_e32 v1, s0, v1
2698; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
2699; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2700; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
2701; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
2702; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
2703; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
2704; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2705; GCN-NEXT:    s_endpgm
; Input IR under test: one vector srem whose result is stored to %out.
2706  %r = srem <4 x i16> %x, %y
2707  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2708  ret void
2709}
2710
; Test: unsigned division on the non-power-of-two-width integer type i3, result
; stored to %out.
;
; The IR assertions below (default FileCheck prefix, produced by the opt
; -amdgpu-codegenprepare RUN line at the top of the file) expect:
;   * zext of both i3 operands to i32, then uitofp to float
;   * llvm.amdgcn.rcp.f32 of the divisor, fmul, llvm.trunc for the quotient
;     estimate
;   * fneg + llvm.amdgcn.fmad.ftz.f32 to form the float remainder, then
;     fabs/fabs/fcmp oge selecting a +1 correction when it is not smaller than
;     the divisor
;   * fptoui, add of the correction, and 7 (the i3 mask), trunc back to i3
;
; The ISA assertions (GCN prefix, produced by the llc RUN line, -mcpu=tahiti)
; pin the exact instruction order and register assignment of the final code.
2711define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2712; CHECK-LABEL: @udiv_i3(
2713; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
2714; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
2715; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
2716; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
2717; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
2718; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
2719; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
2720; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
2721; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
2722; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
2723; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2724; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
2725; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
2726; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
2727; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
2728; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 7
2729; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3
2730; CHECK-NEXT:    store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]]
2731; CHECK-NEXT:    ret void
2732;
2733; GCN-LABEL: udiv_i3:
2734; GCN:       ; %bb.0:
2735; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2736; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
2737; GCN-NEXT:    s_mov_b32 s7, 0xf000
2738; GCN-NEXT:    s_mov_b32 s6, -1
2739; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2740; GCN-NEXT:    s_bfe_u32 s1, s0, 0x30008
2741; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, s1
2742; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
2743; GCN-NEXT:    s_and_b32 s0, s0, 7
2744; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
2745; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
2746; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2747; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
2748; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
2749; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2750; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
2751; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2752; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2753; GCN-NEXT:    s_endpgm
; Input IR under test: one scalar i3 udiv whose result is stored to %out.
2754  %r = udiv i3 %x, %y
2755  store i3 %r, i3 addrspace(1)* %out
2756  ret void
2757}
2758
; Test: unsigned remainder on the non-power-of-two-width integer type i3, result
; stored to %out.
;
; The IR assertions below (default FileCheck prefix, produced by the opt
; -amdgpu-codegenprepare RUN line at the top of the file) expect:
;   * zext of both i3 operands to i32, then uitofp to float
;   * llvm.amdgcn.rcp.f32 of the divisor, fmul, llvm.trunc for the quotient
;     estimate
;   * fneg + llvm.amdgcn.fmad.ftz.f32, fabs/fabs/fcmp oge selecting a +1
;     correction, fptoui and add to form the integer quotient
;   * mul quotient by the zero-extended divisor, sub from the zero-extended
;     dividend to obtain the remainder
;   * and 7 (the i3 mask), trunc back to i3
;
; The ISA assertions (GCN prefix, produced by the llc RUN line, -mcpu=tahiti)
; pin the exact instruction order and register assignment of the final code.
2759define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2760; CHECK-LABEL: @urem_i3(
2761; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
2762; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
2763; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
2764; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
2765; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
2766; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
2767; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
2768; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
2769; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
2770; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
2771; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2772; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
2773; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
2774; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
2775; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
2776; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
2777; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
2778; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 7
2779; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3
2780; CHECK-NEXT:    store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]]
2781; CHECK-NEXT:    ret void
2782;
2783; GCN-LABEL: urem_i3:
2784; GCN:       ; %bb.0:
2785; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2786; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
2787; GCN-NEXT:    s_mov_b32 s7, 0xf000
2788; GCN-NEXT:    s_mov_b32 s6, -1
2789; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2790; GCN-NEXT:    s_bfe_u32 s1, s0, 0x30008
2791; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, s1
2792; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
2793; GCN-NEXT:    s_and_b32 s2, s0, 7
2794; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s2
2795; GCN-NEXT:    s_lshr_b32 s1, s0, 8
2796; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
2797; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2798; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
2799; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
2800; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2801; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
2802; GCN-NEXT:    v_mul_lo_u32 v0, v0, s1
2803; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2804; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2805; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2806; GCN-NEXT:    s_endpgm
; Input IR under test: one scalar i3 urem whose result is stored to %out.
2807  %r = urem i3 %x, %y
2808  store i3 %r, i3 addrspace(1)* %out
2809  ret void
2810}
2811
; Signed division of two i3 kernel arguments; the i3 quotient is stored to %out.
; The opt assertions expect the sdiv to be expanded through float math: both
; operands are sign-extended to i32 and converted with sitofp, the quotient is
; estimated as trunc(x * rcp(y)), and a correction of ((x ^ y) >> 30) | 1
; (+1 when the operand signs match, -1 otherwise) is added when the magnitude
; of the fmad remainder is >= |y|. The sum is then sign-extended from 3 bits
; (shl/ashr by 29) before the trunc to i3.
; The llc assertions pin the machine code produced for the same function.
2812define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2813; CHECK-LABEL: @sdiv_i3(
2814; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
2815; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
2816; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
2817; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
2818; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
2819; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
2820; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
2821; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
2822; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
2823; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
2824; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
2825; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
2826; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
2827; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
2828; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
2829; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
2830; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
2831; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
2832; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 29
2833; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 29
2834; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3
2835; CHECK-NEXT:    store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]]
2836; CHECK-NEXT:    ret void
2837;
2838; GCN-LABEL: sdiv_i3:
2839; GCN:       ; %bb.0:
2840; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2841; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
2842; GCN-NEXT:    s_mov_b32 s7, 0xf000
2843; GCN-NEXT:    s_mov_b32 s6, -1
2844; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2845; GCN-NEXT:    s_bfe_i32 s1, s0, 0x30008
2846; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
2847; GCN-NEXT:    s_bfe_i32 s0, s0, 0x30000
2848; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
2849; GCN-NEXT:    s_xor_b32 s0, s0, s1
2850; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2851; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2852; GCN-NEXT:    s_or_b32 s0, s0, 1
2853; GCN-NEXT:    v_mov_b32_e32 v3, s0
2854; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2855; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2856; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2857; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2858; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
2859; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
2860; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2861; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2862; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2863; GCN-NEXT:    s_endpgm
2864  %r = sdiv i3 %x, %y
2865  store i3 %r, i3 addrspace(1)* %out
2866  ret void
2867}
2868
; Signed remainder of two i3 kernel arguments; the i3 result is stored to %out.
; The opt assertions expect the srem to be expanded through float math: both
; operands are sign-extended to i32 and converted with sitofp, the quotient is
; estimated as trunc(x * rcp(y)) and corrected by ((x ^ y) >> 30) | 1 when the
; magnitude of the fmad remainder is >= |y|. The remainder is then formed in
; integer arithmetic as x - quotient * y, sign-extended from 3 bits
; (shl/ashr by 29) and truncated to i3.
; The llc assertions pin the machine code produced for the same function.
2869define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2870; CHECK-LABEL: @srem_i3(
2871; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
2872; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
2873; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
2874; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
2875; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
2876; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
2877; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
2878; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
2879; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
2880; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
2881; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
2882; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
2883; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
2884; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
2885; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
2886; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
2887; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
2888; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
2889; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
2890; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
2891; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 29
2892; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 29
2893; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3
2894; CHECK-NEXT:    store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]]
2895; CHECK-NEXT:    ret void
2896;
2897; GCN-LABEL: srem_i3:
2898; GCN:       ; %bb.0:
2899; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2900; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
2901; GCN-NEXT:    s_mov_b32 s7, 0xf000
2902; GCN-NEXT:    s_mov_b32 s6, -1
2903; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2904; GCN-NEXT:    s_bfe_i32 s1, s0, 0x30008
2905; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
2906; GCN-NEXT:    s_bfe_i32 s3, s0, 0x30000
2907; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
2908; GCN-NEXT:    s_xor_b32 s1, s3, s1
2909; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2910; GCN-NEXT:    s_ashr_i32 s1, s1, 30
2911; GCN-NEXT:    s_or_b32 s1, s1, 1
2912; GCN-NEXT:    v_mov_b32_e32 v3, s1
2913; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2914; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2915; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2916; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2917; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
2918; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
2919; GCN-NEXT:    s_lshr_b32 s2, s0, 8
2920; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2921; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
2922; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2923; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2924; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2925; GCN-NEXT:    s_endpgm
2926  %r = srem i3 %x, %y
2927  store i3 %r, i3 addrspace(1)* %out
2928  ret void
2929}
2930
; Unsigned division of two <3 x i16> kernel arguments; the vector quotient is
; stored to %out.
; The opt assertions expect the vector udiv to be scalarized: for each of the
; three lanes the operands are extracted, zero-extended to i32 and converted
; with uitofp; the quotient is estimated as trunc(x * rcp(y)) and incremented
; by 1 when the magnitude of the fmad remainder is >= y. Each lane result is
; masked to 16 bits, truncated to i16 and inserted back into the result vector.
; The llc assertions pin the machine code produced for the same function; the
; result is written with one dword store plus one short store at offset 4.
2931define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
2932; CHECK-LABEL: @udiv_v3i16(
2933; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
2934; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
2935; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2936; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2937; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2938; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2939; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2940; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2941; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2942; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2943; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2944; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2945; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2946; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2947; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2948; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2949; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2950; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
2951; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
2952; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0
2953; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1
2954; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1
2955; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
2956; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
2957; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
2958; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
2959; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
2960; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
2961; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
2962; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
2963; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
2964; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
2965; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
2966; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
2967; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
2968; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
2969; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
2970; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
2971; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
2972; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1
2973; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2
2974; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2
2975; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
2976; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
2977; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
2978; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
2979; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
2980; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
2981; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
2982; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
2983; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
2984; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
2985; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
2986; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
2987; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
2988; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
2989; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
2990; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
2991; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
2992; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2
2993; CHECK-NEXT:    store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]]
2994; CHECK-NEXT:    ret void
2995;
2996; GCN-LABEL: udiv_v3i16:
2997; GCN:       ; %bb.0:
2998; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2999; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3000; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3001; GCN-NEXT:    s_mov_b32 s8, 0xffff
3002; GCN-NEXT:    s_mov_b32 s7, 0xf000
3003; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3004; GCN-NEXT:    s_and_b32 s6, s0, s8
3005; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
3006; GCN-NEXT:    s_and_b32 s6, s2, s8
3007; GCN-NEXT:    s_lshr_b32 s0, s0, 16
3008; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s0
3009; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s6
3010; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3011; GCN-NEXT:    s_lshr_b32 s0, s2, 16
3012; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s0
3013; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3014; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
3015; GCN-NEXT:    v_trunc_f32_e32 v2, v2
3016; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
3017; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
3018; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3019; GCN-NEXT:    v_mul_f32_e32 v1, v4, v5
3020; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3021; GCN-NEXT:    s_and_b32 s0, s1, s8
3022; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
3023; GCN-NEXT:    v_mad_f32 v2, -v1, v3, v4
3024; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s0
3025; GCN-NEXT:    s_and_b32 s0, s3, s8
3026; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s0
3027; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
3028; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3029; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
3030; GCN-NEXT:    s_mov_b32 s6, -1
3031; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3032; GCN-NEXT:    v_mul_f32_e32 v2, v5, v6
3033; GCN-NEXT:    v_trunc_f32_e32 v2, v2
3034; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
3035; GCN-NEXT:    v_mad_f32 v2, -v2, v4, v5
3036; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
3037; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3038; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
3039; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
3040; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
3041; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
3042; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3043; GCN-NEXT:    s_endpgm
3044  %r = udiv <3 x i16> %x, %y
3045  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
3046  ret void
3047}
3048
; Unsigned remainder of two <3 x i16> kernel arguments; the vector result is
; stored to %out.
; The opt assertions expect the vector urem to be scalarized: for each of the
; three lanes the operands are extracted, zero-extended to i32 and converted
; with uitofp; the quotient is estimated as trunc(x * rcp(y)) and incremented
; by 1 when the magnitude of the fmad remainder is >= y. The lane remainder is
; then formed in integer arithmetic as x - quotient * y, masked to 16 bits,
; truncated to i16 and inserted back into the result vector.
; The llc assertions pin the machine code produced for the same function; the
; result is written with one dword store plus one short store at offset 4.
3049define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
3050; CHECK-LABEL: @urem_v3i16(
3051; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3052; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3053; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
3054; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
3055; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3056; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3057; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3058; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3059; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3060; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3061; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3062; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3063; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3064; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3065; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3066; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3067; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3068; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
3069; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
3070; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
3071; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
3072; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0
3073; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1
3074; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3075; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
3076; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
3077; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
3078; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
3079; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
3080; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
3081; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
3082; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
3083; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
3084; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
3085; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3086; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
3087; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
3088; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
3089; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
3090; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
3091; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
3092; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
3093; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
3094; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1
3095; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2
3096; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3097; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
3098; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
3099; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
3100; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
3101; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
3102; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
3103; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
3104; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
3105; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
3106; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
3107; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
3108; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
3109; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
3110; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
3111; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
3112; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
3113; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
3114; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
3115; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
3116; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2
3117; CHECK-NEXT:    store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]]
3118; CHECK-NEXT:    ret void
3119;
3120; GCN-LABEL: urem_v3i16:
3121; GCN:       ; %bb.0:
3122; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3123; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3124; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3125; GCN-NEXT:    s_mov_b32 s8, 0xffff
3126; GCN-NEXT:    s_mov_b32 s7, 0xf000
3127; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3128; GCN-NEXT:    v_mov_b32_e32 v1, s2
3129; GCN-NEXT:    s_and_b32 s6, s0, s8
3130; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
3131; GCN-NEXT:    s_and_b32 s6, s2, s8
3132; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s6
3133; GCN-NEXT:    v_mov_b32_e32 v4, s0
3134; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3135; GCN-NEXT:    v_alignbit_b32 v4, s1, v4, 16
3136; GCN-NEXT:    v_and_b32_e32 v5, s8, v4
3137; GCN-NEXT:    v_alignbit_b32 v1, s3, v1, 16
3138; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
3139; GCN-NEXT:    v_trunc_f32_e32 v3, v3
3140; GCN-NEXT:    v_mad_f32 v2, -v3, v0, v2
3141; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v3
3142; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
3143; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v5
3144; GCN-NEXT:    v_and_b32_e32 v3, s8, v1
3145; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
3146; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
3147; GCN-NEXT:    s_and_b32 s0, s1, s8
3148; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v3
3149; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v2
3150; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s0
3151; GCN-NEXT:    s_and_b32 s0, s3, s8
3152; GCN-NEXT:    v_cvt_f32_u32_e32 v7, s0
3153; GCN-NEXT:    v_mul_f32_e32 v5, v3, v5
3154; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3155; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v6
3156; GCN-NEXT:    v_mad_f32 v3, -v5, v2, v3
3157; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
3158; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
3159; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
3160; GCN-NEXT:    v_mul_f32_e32 v3, v7, v8
3161; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
3162; GCN-NEXT:    v_trunc_f32_e32 v3, v3
3163; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
3164; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v3
3165; GCN-NEXT:    v_mad_f32 v3, -v3, v6, v7
3166; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v6
3167; GCN-NEXT:    s_mov_b32 s6, -1
3168; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
3169; GCN-NEXT:    v_mul_lo_u32 v3, v3, s1
3170; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
3171; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3172; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
3173; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
3174; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
3175; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
3176; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3177; GCN-NEXT:    s_endpgm
3178  %r = urem <3 x i16> %x, %y
3179  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
3180  ret void
3181}
3182
; Signed division of two <3 x i16> kernel arguments; the vector quotient is
; stored to %out.
; The opt assertions expect the vector sdiv to be scalarized: for each of the
; three lanes the operands are extracted, sign-extended to i32 and converted
; with sitofp; the quotient is estimated as trunc(x * rcp(y)) and corrected by
; ((x ^ y) >> 30) | 1 (+1 when the operand signs match, -1 otherwise) when the
; magnitude of the fmad remainder is >= |y|. Each lane result is sign-extended
; from 16 bits (shl/ashr by 16), truncated to i16 and inserted back into the
; result vector.
; The llc assertions pin the machine code produced for the same function; the
; result is written with one dword store plus one short store at offset 4.
3183define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
3184; CHECK-LABEL: @sdiv_v3i16(
3185; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3186; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3187; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
3188; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
3189; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3190; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3191; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3192; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3193; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3194; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3195; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3196; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3197; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3198; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3199; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3200; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3201; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3202; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3203; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3204; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3205; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
3206; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
3207; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
3208; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0
3209; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1
3210; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3211; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
3212; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
3213; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
3214; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
3215; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
3216; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
3217; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
3218; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
3219; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
3220; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
3221; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
3222; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
3223; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
3224; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
3225; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3226; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
3227; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
3228; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
3229; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
3230; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
3231; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
3232; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1
3233; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2
3234; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3235; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
3236; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
3237; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
3238; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
3239; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
3240; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
3241; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
3242; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
3243; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
3244; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
3245; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
3246; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
3247; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
3248; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
3249; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
3250; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
3251; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
3252; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
3253; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
3254; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
3255; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
3256; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2
3257; CHECK-NEXT:    store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]]
3258; CHECK-NEXT:    ret void
3259;
3260; GCN-LABEL: sdiv_v3i16:
3261; GCN:       ; %bb.0:
3262; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3263; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3264; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3265; GCN-NEXT:    s_mov_b32 s7, 0xf000
3266; GCN-NEXT:    s_mov_b32 s6, -1
3267; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3268; GCN-NEXT:    s_sext_i32_i16 s9, s2
3269; GCN-NEXT:    s_sext_i32_i16 s8, s0
3270; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
3271; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
3272; GCN-NEXT:    s_xor_b32 s8, s9, s8
3273; GCN-NEXT:    s_ashr_i32 s0, s0, 16
3274; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3275; GCN-NEXT:    s_ashr_i32 s8, s8, 30
3276; GCN-NEXT:    s_or_b32 s8, s8, 1
3277; GCN-NEXT:    v_mov_b32_e32 v3, s8
3278; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
3279; GCN-NEXT:    v_trunc_f32_e32 v2, v2
3280; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
3281; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3282; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
3283; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
3284; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3285; GCN-NEXT:    s_ashr_i32 s2, s2, 16
3286; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3287; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s2
3288; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
3289; GCN-NEXT:    s_xor_b32 s0, s2, s0
3290; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3291; GCN-NEXT:    s_or_b32 s0, s0, 1
3292; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
3293; GCN-NEXT:    v_trunc_f32_e32 v3, v3
3294; GCN-NEXT:    v_mad_f32 v2, -v3, v1, v2
3295; GCN-NEXT:    v_mov_b32_e32 v4, s0
3296; GCN-NEXT:    s_sext_i32_i16 s0, s1
3297; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
3298; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
3299; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
3300; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
3301; GCN-NEXT:    s_sext_i32_i16 s1, s3
3302; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
3303; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s1
3304; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3305; GCN-NEXT:    s_xor_b32 s0, s1, s0
3306; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3307; GCN-NEXT:    s_or_b32 s0, s0, 1
3308; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3309; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3310; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
3311; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
3312; GCN-NEXT:    v_mov_b32_e32 v5, s0
3313; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
3314; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
3315; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
3316; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3317; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3318; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
3319; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
3320; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3321; GCN-NEXT:    s_endpgm
3322  %r = sdiv <3 x i16> %x, %y
3323  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
3324  ret void
3325}
3326
3327define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
3328; CHECK-LABEL: @srem_v3i16(
3329; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3330; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3331; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
3332; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
3333; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3334; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3335; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3336; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3337; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3338; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3339; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3340; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3341; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3342; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3343; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3344; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3345; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3346; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3347; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3348; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3349; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
3350; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
3351; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
3352; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
3353; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
3354; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0
3355; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1
3356; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3357; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
3358; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
3359; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
3360; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
3361; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
3362; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
3363; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
3364; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3365; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
3366; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
3367; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
3368; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
3369; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
3370; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
3371; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
3372; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
3373; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
3374; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
3375; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
3376; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
3377; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
3378; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
3379; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
3380; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1
3381; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2
3382; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3383; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
3384; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
3385; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
3386; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
3387; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
3388; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
3389; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
3390; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
3391; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
3392; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
3393; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
3394; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
3395; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
3396; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
3397; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
3398; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
3399; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
3400; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
3401; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
3402; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
3403; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
3404; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
3405; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
3406; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2
3407; CHECK-NEXT:    store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]]
3408; CHECK-NEXT:    ret void
3409;
3410; GCN-LABEL: srem_v3i16:
3411; GCN:       ; %bb.0:
3412; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3413; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3414; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3415; GCN-NEXT:    s_mov_b32 s7, 0xf000
3416; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3417; GCN-NEXT:    s_sext_i32_i16 s8, s2
3418; GCN-NEXT:    s_sext_i32_i16 s6, s0
3419; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s6
3420; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s8
3421; GCN-NEXT:    s_xor_b32 s6, s8, s6
3422; GCN-NEXT:    s_ashr_i32 s6, s6, 30
3423; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3424; GCN-NEXT:    s_or_b32 s6, s6, 1
3425; GCN-NEXT:    v_mov_b32_e32 v3, s6
3426; GCN-NEXT:    s_mov_b32 s6, -1
3427; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
3428; GCN-NEXT:    v_trunc_f32_e32 v2, v2
3429; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
3430; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
3431; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3432; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3433; GCN-NEXT:    v_mov_b32_e32 v1, s2
3434; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3435; GCN-NEXT:    v_mov_b32_e32 v2, s0
3436; GCN-NEXT:    v_alignbit_b32 v2, s1, v2, 16
3437; GCN-NEXT:    v_bfe_i32 v3, v2, 0, 16
3438; GCN-NEXT:    v_cvt_f32_i32_e32 v4, v3
3439; GCN-NEXT:    v_alignbit_b32 v1, s3, v1, 16
3440; GCN-NEXT:    v_bfe_i32 v5, v1, 0, 16
3441; GCN-NEXT:    v_cvt_f32_i32_e32 v6, v5
3442; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v4
3443; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
3444; GCN-NEXT:    v_xor_b32_e32 v3, v5, v3
3445; GCN-NEXT:    s_sext_i32_i16 s0, s1
3446; GCN-NEXT:    v_mul_f32_e32 v5, v6, v7
3447; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3448; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
3449; GCN-NEXT:    v_mad_f32 v6, -v5, v4, v6
3450; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
3451; GCN-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
3452; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
3453; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s0
3454; GCN-NEXT:    v_or_b32_e32 v3, 1, v3
3455; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
3456; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
3457; GCN-NEXT:    s_sext_i32_i16 s2, s3
3458; GCN-NEXT:    v_mul_lo_u32 v2, v3, v2
3459; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s2
3460; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v4
3461; GCN-NEXT:    s_xor_b32 s0, s2, s0
3462; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3463; GCN-NEXT:    s_or_b32 s0, s0, 1
3464; GCN-NEXT:    v_mul_f32_e32 v5, v3, v5
3465; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3466; GCN-NEXT:    v_mad_f32 v3, -v5, v4, v3
3467; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
3468; GCN-NEXT:    v_mov_b32_e32 v6, s0
3469; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v4|
3470; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
3471; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
3472; GCN-NEXT:    v_mul_lo_u32 v3, v3, s1
3473; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
3474; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3475; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3476; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
3477; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
3478; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
3479; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3480; GCN-NEXT:    s_endpgm
3481  %r = srem <3 x i16> %x, %y
3482  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
3483  ret void
3484}
3485
; Unsigned division of a <3 x i15> vector (see the udiv in the body below).
; The CHECK-prefixed lines expect every lane to be extracted, zero-extended to
; i32 and divided in float: rcp of the divisor, fmul, trunc, followed by a +1
; fix-up that is selected when |fmad(-q, y, x)| >= |y|. Each quotient is then
; masked with 32767, truncated back to i15 and re-inserted into the vector.
; The GCN-prefixed lines expect the three results to be packed and written
; with one dword store plus one short store at offset 4.
3486define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3487; CHECK-LABEL: @udiv_v3i15(
3488; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3489; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3490; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
3491; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
3492; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3493; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3494; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3495; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3496; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3497; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3498; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3499; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3500; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3501; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3502; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3503; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3504; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3505; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 32767
3506; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15
3507; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0
3508; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1
3509; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3510; CHECK-NEXT:    [[TMP23:%.*]] = zext i15 [[TMP21]] to i32
3511; CHECK-NEXT:    [[TMP24:%.*]] = zext i15 [[TMP22]] to i32
3512; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
3513; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
3514; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
3515; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
3516; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
3517; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
3518; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
3519; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
3520; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
3521; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
3522; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
3523; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
3524; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
3525; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 32767
3526; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15
3527; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1
3528; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2
3529; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3530; CHECK-NEXT:    [[TMP43:%.*]] = zext i15 [[TMP41]] to i32
3531; CHECK-NEXT:    [[TMP44:%.*]] = zext i15 [[TMP42]] to i32
3532; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
3533; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
3534; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
3535; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
3536; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
3537; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
3538; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
3539; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
3540; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
3541; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
3542; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
3543; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
3544; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
3545; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 32767
3546; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15
3547; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2
3548; CHECK-NEXT:    store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]]
3549; CHECK-NEXT:    ret void
3550;
3551; GCN-LABEL: udiv_v3i15:
3552; GCN:       ; %bb.0:
3553; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3554; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3555; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3556; GCN-NEXT:    s_mov_b32 s7, 0xf000
3557; GCN-NEXT:    s_mov_b32 s6, -1
3558; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3559; GCN-NEXT:    v_mov_b32_e32 v0, s2
3560; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3561; GCN-NEXT:    s_movk_i32 s3, 0x7fff
3562; GCN-NEXT:    s_and_b32 s9, s0, s3
3563; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
3564; GCN-NEXT:    v_mov_b32_e32 v2, s0
3565; GCN-NEXT:    s_and_b32 s8, s2, s3
3566; GCN-NEXT:    s_bfe_u32 s0, s0, 0xf000f
3567; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s0
3568; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s8
3569; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v1
3570; GCN-NEXT:    s_bfe_u32 s2, s2, 0xf000f
3571; GCN-NEXT:    v_alignbit_b32 v2, s1, v2, 30
3572; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s2
3573; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3574; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v5
3575; GCN-NEXT:    v_and_b32_e32 v2, s3, v2
3576; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3577; GCN-NEXT:    v_mad_f32 v3, -v4, v1, v3
3578; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
3579; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
3580; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
3581; GCN-NEXT:    v_mul_f32_e32 v1, v6, v7
3582; GCN-NEXT:    v_and_b32_e32 v0, s3, v0
3583; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3584; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
3585; GCN-NEXT:    v_mad_f32 v4, -v1, v5, v6
3586; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
3587; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
3588; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v2
3589; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
3590; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
3591; GCN-NEXT:    v_mul_f32_e32 v1, v0, v6
3592; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3593; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v1
3594; GCN-NEXT:    v_mad_f32 v0, -v1, v2, v0
3595; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v2
3596; GCN-NEXT:    v_and_b32_e32 v2, s3, v3
3597; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
3598; GCN-NEXT:    v_and_b32_e32 v3, s3, v4
3599; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
3600; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
3601; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
3602; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
3603; GCN-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
3604; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3605; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
3606; GCN-NEXT:    s_endpgm
; Input IR: the vector udiv whose expansion the assertions above describe.
3607  %r = udiv <3 x i15> %x, %y
3608  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
3609  ret void
3610}
3611
; Unsigned remainder of a <3 x i15> vector (see the urem in the body below).
; The CHECK-prefixed lines expect every lane to be extracted, zero-extended to
; i32 and have its quotient computed in float: rcp of the divisor, fmul, trunc,
; followed by a +1 fix-up that is selected when |fmad(-q, y, x)| >= |y|. The
; remainder is then formed as x - q * y, masked with 32767, truncated back to
; i15 and re-inserted into the vector.
; The GCN-prefixed lines expect the three results to be packed and written
; with one dword store plus one short store at offset 4.
3612define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3613; CHECK-LABEL: @urem_v3i15(
3614; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3615; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3616; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
3617; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
3618; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3619; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3620; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3621; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3622; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3623; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3624; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3625; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3626; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3627; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3628; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3629; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3630; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3631; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
3632; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
3633; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 32767
3634; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15
3635; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0
3636; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1
3637; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3638; CHECK-NEXT:    [[TMP25:%.*]] = zext i15 [[TMP23]] to i32
3639; CHECK-NEXT:    [[TMP26:%.*]] = zext i15 [[TMP24]] to i32
3640; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
3641; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
3642; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
3643; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
3644; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
3645; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
3646; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
3647; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
3648; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3649; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
3650; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
3651; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
3652; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
3653; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
3654; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
3655; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 32767
3656; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15
3657; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1
3658; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2
3659; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3660; CHECK-NEXT:    [[TMP47:%.*]] = zext i15 [[TMP45]] to i32
3661; CHECK-NEXT:    [[TMP48:%.*]] = zext i15 [[TMP46]] to i32
3662; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
3663; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
3664; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
3665; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
3666; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
3667; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
3668; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
3669; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
3670; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
3671; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
3672; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
3673; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
3674; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
3675; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
3676; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
3677; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 32767
3678; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15
3679; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2
3680; CHECK-NEXT:    store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]]
3681; CHECK-NEXT:    ret void
3682;
3683; GCN-LABEL: urem_v3i15:
3684; GCN:       ; %bb.0:
3685; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3686; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3687; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3688; GCN-NEXT:    s_mov_b32 s7, 0xf000
3689; GCN-NEXT:    s_mov_b32 s6, -1
3690; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3691; GCN-NEXT:    v_mov_b32_e32 v0, s2
3692; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3693; GCN-NEXT:    s_movk_i32 s3, 0x7fff
3694; GCN-NEXT:    s_and_b32 s10, s0, s3
3695; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s10
3696; GCN-NEXT:    s_and_b32 s9, s2, s3
3697; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s9
3698; GCN-NEXT:    v_mov_b32_e32 v2, s0
3699; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v1
3700; GCN-NEXT:    v_alignbit_b32 v2, s1, v2, 30
3701; GCN-NEXT:    s_bfe_u32 s1, s0, 0xf000f
3702; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s1
3703; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3704; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3705; GCN-NEXT:    v_mad_f32 v3, -v4, v1, v3
3706; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
3707; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
3708; GCN-NEXT:    s_bfe_u32 s10, s2, 0xf000f
3709; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s10
3710; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
3711; GCN-NEXT:    v_mul_lo_u32 v1, v1, s0
3712; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v5
3713; GCN-NEXT:    v_and_b32_e32 v2, s3, v2
3714; GCN-NEXT:    v_and_b32_e32 v0, s3, v0
3715; GCN-NEXT:    v_sub_i32_e32 v6, vcc, s2, v1
3716; GCN-NEXT:    v_mul_f32_e32 v1, v3, v4
3717; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v2
3718; GCN-NEXT:    v_cvt_f32_u32_e32 v7, v0
3719; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3720; GCN-NEXT:    v_mad_f32 v3, -v1, v5, v3
3721; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v4
3722; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
3723; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
3724; GCN-NEXT:    s_lshr_b32 s0, s0, 15
3725; GCN-NEXT:    v_mul_f32_e32 v3, v7, v8
3726; GCN-NEXT:    v_trunc_f32_e32 v3, v3
3727; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v3
3728; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3729; GCN-NEXT:    v_mad_f32 v3, -v3, v4, v7
3730; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
3731; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
3732; GCN-NEXT:    v_mul_lo_u32 v1, v1, s0
3733; GCN-NEXT:    v_mul_lo_u32 v2, v3, v2
3734; GCN-NEXT:    s_lshr_b32 s8, s2, 15
3735; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v1
3736; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
3737; GCN-NEXT:    v_and_b32_e32 v3, s3, v3
3738; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
3739; GCN-NEXT:    v_and_b32_e32 v2, s3, v6
3740; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
3741; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
3742; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
3743; GCN-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
3744; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3745; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
3746; GCN-NEXT:    s_endpgm
; Input IR: the vector urem whose expansion the assertions above describe.
3747  %r = urem <3 x i15> %x, %y
3748  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
3749  ret void
3750}
3751
; Signed division of a <3 x i15> vector (see the sdiv in the body below).
; The CHECK-prefixed lines expect every lane to be extracted, sign-extended to
; i32 and divided in float: rcp of the divisor, fmul, trunc, then fptosi. The
; xor / ashr 30 / or 1 sequence on the two sign-extended operands produces the
; fix-up value that is added to the quotient when |fmad(-q, y, x)| >= |y|
; (0 is added otherwise). The shl 17 / ashr 17 pair then re-sign-extends the
; low 15 bits before the trunc back to i15 and the re-insertion.
; The GCN-prefixed lines expect the three results to be packed and written
; with one dword store plus one short store at offset 4.
3752define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3753; CHECK-LABEL: @sdiv_v3i15(
3754; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3755; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3756; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
3757; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
3758; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3759; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3760; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3761; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3762; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3763; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3764; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3765; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3766; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3767; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3768; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3769; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3770; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3771; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3772; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3773; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3774; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 17
3775; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 17
3776; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15
3777; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0
3778; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1
3779; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3780; CHECK-NEXT:    [[TMP27:%.*]] = sext i15 [[TMP25]] to i32
3781; CHECK-NEXT:    [[TMP28:%.*]] = sext i15 [[TMP26]] to i32
3782; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
3783; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
3784; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
3785; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
3786; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
3787; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
3788; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
3789; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
3790; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
3791; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
3792; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
3793; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
3794; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3795; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
3796; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
3797; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
3798; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 17
3799; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 17
3800; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15
3801; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1
3802; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2
3803; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3804; CHECK-NEXT:    [[TMP51:%.*]] = sext i15 [[TMP49]] to i32
3805; CHECK-NEXT:    [[TMP52:%.*]] = sext i15 [[TMP50]] to i32
3806; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
3807; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
3808; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
3809; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
3810; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
3811; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
3812; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
3813; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
3814; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
3815; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
3816; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
3817; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
3818; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
3819; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
3820; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
3821; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
3822; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 17
3823; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 17
3824; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15
3825; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2
3826; CHECK-NEXT:    store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]]
3827; CHECK-NEXT:    ret void
3828;
3829; GCN-LABEL: sdiv_v3i15:
3830; GCN:       ; %bb.0:
3831; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3832; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3833; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3834; GCN-NEXT:    s_mov_b32 s7, 0xf000
3835; GCN-NEXT:    s_mov_b32 s6, -1
3836; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3837; GCN-NEXT:    v_mov_b32_e32 v0, s2
3838; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3839; GCN-NEXT:    s_bfe_i32 s3, s0, 0xf0000
3840; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s3
3841; GCN-NEXT:    v_mov_b32_e32 v1, s0
3842; GCN-NEXT:    v_alignbit_b32 v1, s1, v1, 30
3843; GCN-NEXT:    s_bfe_i32 s1, s2, 0xf0000
3844; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s1
3845; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3846; GCN-NEXT:    s_xor_b32 s1, s1, s3
3847; GCN-NEXT:    s_bfe_i32 s0, s0, 0xf000f
3848; GCN-NEXT:    s_ashr_i32 s1, s1, 30
3849; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3850; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3851; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
3852; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
3853; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
3854; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s0
3855; GCN-NEXT:    s_or_b32 s1, s1, 1
3856; GCN-NEXT:    v_mov_b32_e32 v5, s1
3857; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
3858; GCN-NEXT:    s_bfe_i32 s1, s2, 0xf000f
3859; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
3860; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s1
3861; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3862; GCN-NEXT:    s_xor_b32 s0, s1, s0
3863; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 15
3864; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3865; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
3866; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3867; GCN-NEXT:    v_mad_f32 v4, -v5, v3, v4
3868; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
3869; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
3870; GCN-NEXT:    v_cvt_f32_i32_e32 v4, v1
3871; GCN-NEXT:    s_or_b32 s0, s0, 1
3872; GCN-NEXT:    v_mov_b32_e32 v6, s0
3873; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
3874; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 15
3875; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
3876; GCN-NEXT:    v_cvt_f32_i32_e32 v5, v0
3877; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3878; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
3879; GCN-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
3880; GCN-NEXT:    v_or_b32_e32 v0, 1, v0
3881; GCN-NEXT:    v_mul_f32_e32 v1, v5, v6
3882; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3883; GCN-NEXT:    v_mad_f32 v5, -v1, v4, v5
3884; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
3885; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
3886; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
3887; GCN-NEXT:    s_movk_i32 s0, 0x7fff
3888; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
3889; GCN-NEXT:    v_and_b32_e32 v3, s0, v3
3890; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
3891; GCN-NEXT:    v_and_b32_e32 v2, s0, v2
3892; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
3893; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
3894; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
3895; GCN-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
3896; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3897; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
3898; GCN-NEXT:    s_endpgm
; Input IR: the vector sdiv whose expansion the assertions above describe.
3899  %r = sdiv <3 x i15> %x, %y
3900  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
3901  ret void
3902}
3903
; Signed remainder on a <3 x i15> vector whose lanes are narrower than 32 bits.
; The opt-side assertions pin a per-lane expansion: each lane is extracted,
; sign-extended to i32, divided through a float reciprocal (rcp, fmul, trunc,
; fmad) with a +/-1 quotient correction, turned into a remainder with mul/sub,
; and re-sign-extended from 15 bits via shl 17 / ashr 17 before the trunc.
; The llc-side assertions pin the matching ISA, ending in a dword store plus a
; short store (upper word masked with 0x1fff) for the packed 45-bit result.
3904define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3905; CHECK-LABEL: @srem_v3i15(
3906; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3907; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3908; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
3909; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
3910; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3911; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3912; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3913; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3914; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3915; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3916; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3917; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3918; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3919; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3920; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3921; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3922; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3923; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3924; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3925; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3926; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
3927; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
3928; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 17
3929; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 17
3930; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15
3931; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0
3932; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1
3933; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3934; CHECK-NEXT:    [[TMP29:%.*]] = sext i15 [[TMP27]] to i32
3935; CHECK-NEXT:    [[TMP30:%.*]] = sext i15 [[TMP28]] to i32
3936; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
3937; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
3938; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
3939; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
3940; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
3941; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3942; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
3943; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
3944; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
3945; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
3946; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
3947; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
3948; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
3949; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
3950; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
3951; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
3952; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
3953; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
3954; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 17
3955; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 17
3956; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15
3957; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1
3958; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2
3959; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3960; CHECK-NEXT:    [[TMP55:%.*]] = sext i15 [[TMP53]] to i32
3961; CHECK-NEXT:    [[TMP56:%.*]] = sext i15 [[TMP54]] to i32
3962; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
3963; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
3964; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
3965; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
3966; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
3967; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
3968; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
3969; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
3970; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
3971; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
3972; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
3973; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
3974; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
3975; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
3976; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
3977; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
3978; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
3979; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
3980; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 17
3981; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 17
3982; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15
3983; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2
3984; CHECK-NEXT:    store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]]
3985; CHECK-NEXT:    ret void
3986;
3987; GCN-LABEL: srem_v3i15:
3988; GCN:       ; %bb.0:
3989; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3990; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3991; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3992; GCN-NEXT:    s_mov_b32 s7, 0xf000
3993; GCN-NEXT:    s_mov_b32 s6, -1
3994; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3995; GCN-NEXT:    v_mov_b32_e32 v0, s2
3996; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3997; GCN-NEXT:    s_movk_i32 s3, 0x7fff
3998; GCN-NEXT:    s_and_b32 s11, s0, s3
3999; GCN-NEXT:    s_bfe_i32 s11, s11, 0xf0000
4000; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s11
4001; GCN-NEXT:    s_and_b32 s9, s2, s3
4002; GCN-NEXT:    s_bfe_i32 s9, s9, 0xf0000
4003; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s9
4004; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
4005; GCN-NEXT:    s_xor_b32 s9, s9, s11
4006; GCN-NEXT:    s_ashr_i32 s9, s9, 30
4007; GCN-NEXT:    s_or_b32 s9, s9, 1
4008; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
4009; GCN-NEXT:    v_trunc_f32_e32 v4, v4
4010; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
4011; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
4012; GCN-NEXT:    v_mov_b32_e32 v5, s9
4013; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
4014; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
4015; GCN-NEXT:    v_mov_b32_e32 v1, s0
4016; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
4017; GCN-NEXT:    s_bfe_u32 s12, s0, 0xf000f
4018; GCN-NEXT:    v_alignbit_b32 v1, s1, v1, 30
4019; GCN-NEXT:    v_mul_lo_u32 v2, v2, s0
4020; GCN-NEXT:    s_lshr_b32 s1, s0, 15
4021; GCN-NEXT:    s_bfe_i32 s0, s12, 0xf0000
4022; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s0
4023; GCN-NEXT:    s_bfe_u32 s10, s2, 0xf000f
4024; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
4025; GCN-NEXT:    s_lshr_b32 s8, s2, 15
4026; GCN-NEXT:    s_bfe_i32 s2, s10, 0xf0000
4027; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s2
4028; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
4029; GCN-NEXT:    s_xor_b32 s0, s2, s0
4030; GCN-NEXT:    s_ashr_i32 s0, s0, 30
4031; GCN-NEXT:    s_or_b32 s0, s0, 1
4032; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
4033; GCN-NEXT:    v_trunc_f32_e32 v5, v5
4034; GCN-NEXT:    v_mad_f32 v4, -v5, v3, v4
4035; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
4036; GCN-NEXT:    v_and_b32_e32 v1, s3, v1
4037; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
4038; GCN-NEXT:    v_mov_b32_e32 v6, s0
4039; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
4040; GCN-NEXT:    v_bfe_i32 v4, v1, 0, 15
4041; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
4042; GCN-NEXT:    v_cvt_f32_i32_e32 v5, v4
4043; GCN-NEXT:    v_and_b32_e32 v0, s3, v0
4044; GCN-NEXT:    v_bfe_i32 v6, v0, 0, 15
4045; GCN-NEXT:    v_cvt_f32_i32_e32 v7, v6
4046; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v5
4047; GCN-NEXT:    v_xor_b32_e32 v4, v6, v4
4048; GCN-NEXT:    v_ashrrev_i32_e32 v4, 30, v4
4049; GCN-NEXT:    v_or_b32_e32 v4, 1, v4
4050; GCN-NEXT:    v_mul_f32_e32 v6, v7, v8
4051; GCN-NEXT:    v_trunc_f32_e32 v6, v6
4052; GCN-NEXT:    v_mad_f32 v7, -v6, v5, v7
4053; GCN-NEXT:    v_cvt_i32_f32_e32 v6, v6
4054; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v5|
4055; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
4056; GCN-NEXT:    v_mul_lo_u32 v3, v3, s1
4057; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
4058; GCN-NEXT:    v_mul_lo_u32 v1, v4, v1
4059; GCN-NEXT:    v_and_b32_e32 v2, s3, v2
4060; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
4061; GCN-NEXT:    v_and_b32_e32 v3, s3, v3
4062; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
4063; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
4064; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
4065; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
4066; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
4067; GCN-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
4068; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4069; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
4070; GCN-NEXT:    s_endpgm
4071  %r = srem <3 x i15> %x, %y
4072  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
4073  ret void
4074}
4075
; Unsigned divide by the odd constant 1235195. The opt-side assertions expect
; the udiv to pass through the IR pass untouched; the llc-side assertions
; expect a multiply-high by 0xb2a50881 followed by a sub / shift-by-1 / add /
; shift-by-20 fixup rather than a reciprocal-based expansion.
4076define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
4077; CHECK-LABEL: @udiv_i32_oddk_denom(
4078; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 1235195
4079; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
4080; CHECK-NEXT:    ret void
4081;
4082; GCN-LABEL: udiv_i32_oddk_denom:
4083; GCN:       ; %bb.0:
4084; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4085; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4086; GCN-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
4087; GCN-NEXT:    s_mov_b32 s7, 0xf000
4088; GCN-NEXT:    s_mov_b32 s6, -1
4089; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4090; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
4091; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
4092; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
4093; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
4094; GCN-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
4095; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4096; GCN-NEXT:    s_endpgm
4097  %r = udiv i32 %x, 1235195
4098  store i32 %r, i32 addrspace(1)* %out
4099  ret void
4100}
4101
; Unsigned divide by the power-of-two constant 4096. The opt-side assertions
; expect the udiv to be left unchanged in IR; the llc-side assertions expect a
; single s_lshr_b32 by 12.
4102define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
4103; CHECK-LABEL: @udiv_i32_pow2k_denom(
4104; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 4096
4105; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
4106; CHECK-NEXT:    ret void
4107;
4108; GCN-LABEL: udiv_i32_pow2k_denom:
4109; GCN:       ; %bb.0:
4110; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4111; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4112; GCN-NEXT:    s_mov_b32 s7, 0xf000
4113; GCN-NEXT:    s_mov_b32 s6, -1
4114; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4115; GCN-NEXT:    s_lshr_b32 s0, s0, 12
4116; GCN-NEXT:    v_mov_b32_e32 v0, s0
4117; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4118; GCN-NEXT:    s_endpgm
4119  %r = udiv i32 %x, 4096
4120  store i32 %r, i32 addrspace(1)* %out
4121  ret void
4122}
4123
; Unsigned divide whose divisor is 4096 shifted left by the runtime value %y.
; The opt-side assertions expect the shl/udiv pair unchanged in IR; the
; llc-side assertions expect the divide folded into an add of 12 to the shift
; amount followed by one s_lshr_b32.
4124define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
4125; CHECK-LABEL: @udiv_i32_pow2_shl_denom(
4126; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
4127; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]]
4128; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
4129; CHECK-NEXT:    ret void
4130;
4131; GCN-LABEL: udiv_i32_pow2_shl_denom:
4132; GCN:       ; %bb.0:
4133; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4134; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4135; GCN-NEXT:    s_mov_b32 s7, 0xf000
4136; GCN-NEXT:    s_mov_b32 s6, -1
4137; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4138; GCN-NEXT:    s_add_i32 s1, s1, 12
4139; GCN-NEXT:    s_lshr_b32 s0, s0, s1
4140; GCN-NEXT:    v_mov_b32_e32 v0, s0
4141; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4142; GCN-NEXT:    s_endpgm
4143  %shl.y = shl i32 4096, %y
4144  %r = udiv i32 %x, %shl.y
4145  store i32 %r, i32 addrspace(1)* %out
4146  ret void
4147}
4148
; Two-lane unsigned divide by a splat of 4096. The opt-side assertions expect
; the vector op to be split into one scalar udiv per lane and reassembled with
; insertelement; the llc-side assertions expect each lane to become an
; s_lshr_b32 by 12.
4149define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4150; CHECK-LABEL: @udiv_v2i32_pow2k_denom(
4151; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4152; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
4153; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4154; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4155; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096
4156; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4157; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]]
4158; CHECK-NEXT:    ret void
4159;
4160; GCN-LABEL: udiv_v2i32_pow2k_denom:
4161; GCN:       ; %bb.0:
4162; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4163; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4164; GCN-NEXT:    s_mov_b32 s7, 0xf000
4165; GCN-NEXT:    s_mov_b32 s6, -1
4166; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4167; GCN-NEXT:    s_lshr_b32 s0, s0, 12
4168; GCN-NEXT:    s_lshr_b32 s1, s1, 12
4169; GCN-NEXT:    v_mov_b32_e32 v0, s0
4170; GCN-NEXT:    v_mov_b32_e32 v1, s1
4171; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4172; GCN-NEXT:    s_endpgm
4173  %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
4174  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4175  ret void
4176}
4177
; Two-lane unsigned divide with one power-of-two divisor (4096, lane 0) and one
; non-power-of-two divisor (4095, lane 1). The opt-side assertions expect a
; per-lane split into scalar udivs; the llc-side assertions expect lane 0 as an
; s_lshr_b32 by 12 and lane 1 as a multiply-high by 0x100101 followed by a
; sub / shift-by-1 / add / shift-by-11 fixup.
4178define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4179; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom(
4180; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4181; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
4182; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4183; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4184; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095
4185; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4186; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]]
4187; CHECK-NEXT:    ret void
4188;
4189; GCN-LABEL: udiv_v2i32_mixed_pow2k_denom:
4190; GCN:       ; %bb.0:
4191; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4192; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4193; GCN-NEXT:    v_mov_b32_e32 v0, 0x100101
4194; GCN-NEXT:    s_mov_b32 s7, 0xf000
4195; GCN-NEXT:    s_mov_b32 s6, -1
4196; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4197; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
4198; GCN-NEXT:    s_lshr_b32 s0, s0, 12
4199; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v0
4200; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
4201; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
4202; GCN-NEXT:    v_lshrrev_b32_e32 v1, 11, v0
4203; GCN-NEXT:    v_mov_b32_e32 v0, s0
4204; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4205; GCN-NEXT:    s_endpgm
4206  %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
4207  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4208  ret void
4209}
4210
; Two-lane unsigned divide where each divisor lane is 4096 shifted left by the
; matching lane of %y. Unlike the scalar udiv_i32_pow2_shl_denom case, the
; opt-side assertions here expect every lane to be expanded into the
; float-reciprocal sequence (uitofp, rcp, scale by 0x41F0000000000000, fptoui,
; integer refinement, then +/-1 quotient selects), and the llc-side assertions
; pin the corresponding v_rcp_iflag_f32 based ISA.
; Reviewer note - the scalar case lowers to a plain shift, so this expected
; output may be recording a missed optimization; confirm before relying on it.
4211define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
4212; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom(
4213; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
4214; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4215; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
4216; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
4217; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
4218; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41F0000000000000
4219; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
4220; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
4221; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP2]] to i64
4222; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
4223; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
4224; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
4225; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
4226; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP10]]
4227; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
4228; CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP10]]
4229; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
4230; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP6]] to i64
4231; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
4232; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
4233; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
4234; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
4235; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP6]], [[TMP21]]
4236; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP6]], [[TMP21]]
4237; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP14]], i32 [[TMP22]], i32 [[TMP23]]
4238; CHECK-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
4239; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[TMP1]] to i64
4240; CHECK-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP25]], [[TMP26]]
4241; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
4242; CHECK-NEXT:    [[TMP29:%.*]] = lshr i64 [[TMP27]], 32
4243; CHECK-NEXT:    [[TMP30:%.*]] = trunc i64 [[TMP29]] to i32
4244; CHECK-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], [[TMP2]]
4245; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP1]], [[TMP31]]
4246; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP2]]
4247; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP1]], [[TMP31]]
4248; CHECK-NEXT:    [[TMP35:%.*]] = and i1 [[TMP33]], [[TMP34]]
4249; CHECK-NEXT:    [[TMP36:%.*]] = add i32 [[TMP30]], 1
4250; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP30]], 1
4251; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP35]], i32 [[TMP36]], i32 [[TMP30]]
4252; CHECK-NEXT:    [[TMP39:%.*]] = select i1 [[TMP34]], i32 [[TMP38]], i32 [[TMP37]]
4253; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <2 x i32> undef, i32 [[TMP39]], i64 0
4254; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <2 x i32> [[X]], i64 1
4255; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
4256; CHECK-NEXT:    [[TMP43:%.*]] = uitofp i32 [[TMP42]] to float
4257; CHECK-NEXT:    [[TMP44:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP43]])
4258; CHECK-NEXT:    [[TMP45:%.*]] = fmul fast float [[TMP44]], 0x41F0000000000000
4259; CHECK-NEXT:    [[TMP46:%.*]] = fptoui float [[TMP45]] to i32
4260; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP46]] to i64
4261; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP42]] to i64
4262; CHECK-NEXT:    [[TMP49:%.*]] = mul i64 [[TMP47]], [[TMP48]]
4263; CHECK-NEXT:    [[TMP50:%.*]] = trunc i64 [[TMP49]] to i32
4264; CHECK-NEXT:    [[TMP51:%.*]] = lshr i64 [[TMP49]], 32
4265; CHECK-NEXT:    [[TMP52:%.*]] = trunc i64 [[TMP51]] to i32
4266; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 0, [[TMP50]]
4267; CHECK-NEXT:    [[TMP54:%.*]] = icmp eq i32 [[TMP52]], 0
4268; CHECK-NEXT:    [[TMP55:%.*]] = select i1 [[TMP54]], i32 [[TMP53]], i32 [[TMP50]]
4269; CHECK-NEXT:    [[TMP56:%.*]] = zext i32 [[TMP55]] to i64
4270; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP46]] to i64
4271; CHECK-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP56]], [[TMP57]]
4272; CHECK-NEXT:    [[TMP59:%.*]] = trunc i64 [[TMP58]] to i32
4273; CHECK-NEXT:    [[TMP60:%.*]] = lshr i64 [[TMP58]], 32
4274; CHECK-NEXT:    [[TMP61:%.*]] = trunc i64 [[TMP60]] to i32
4275; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP46]], [[TMP61]]
4276; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP46]], [[TMP61]]
4277; CHECK-NEXT:    [[TMP64:%.*]] = select i1 [[TMP54]], i32 [[TMP62]], i32 [[TMP63]]
4278; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP64]] to i64
4279; CHECK-NEXT:    [[TMP66:%.*]] = zext i32 [[TMP41]] to i64
4280; CHECK-NEXT:    [[TMP67:%.*]] = mul i64 [[TMP65]], [[TMP66]]
4281; CHECK-NEXT:    [[TMP68:%.*]] = trunc i64 [[TMP67]] to i32
4282; CHECK-NEXT:    [[TMP69:%.*]] = lshr i64 [[TMP67]], 32
4283; CHECK-NEXT:    [[TMP70:%.*]] = trunc i64 [[TMP69]] to i32
4284; CHECK-NEXT:    [[TMP71:%.*]] = mul i32 [[TMP70]], [[TMP42]]
4285; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP41]], [[TMP71]]
4286; CHECK-NEXT:    [[TMP73:%.*]] = icmp uge i32 [[TMP72]], [[TMP42]]
4287; CHECK-NEXT:    [[TMP74:%.*]] = icmp uge i32 [[TMP41]], [[TMP71]]
4288; CHECK-NEXT:    [[TMP75:%.*]] = and i1 [[TMP73]], [[TMP74]]
4289; CHECK-NEXT:    [[TMP76:%.*]] = add i32 [[TMP70]], 1
4290; CHECK-NEXT:    [[TMP77:%.*]] = sub i32 [[TMP70]], 1
4291; CHECK-NEXT:    [[TMP78:%.*]] = select i1 [[TMP75]], i32 [[TMP76]], i32 [[TMP70]]
4292; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP74]], i32 [[TMP78]], i32 [[TMP77]]
4293; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <2 x i32> [[TMP40]], i32 [[TMP79]], i64 1
4294; CHECK-NEXT:    store <2 x i32> [[TMP80]], <2 x i32> addrspace(1)* [[OUT:%.*]]
4295; CHECK-NEXT:    ret void
4296;
4297; GCN-LABEL: udiv_v2i32_pow2_shl_denom:
4298; GCN:       ; %bb.0:
4299; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
4300; GCN-NEXT:    s_movk_i32 s4, 0x1000
4301; GCN-NEXT:    s_mov_b32 s7, 0xf000
4302; GCN-NEXT:    s_mov_b32 s6, -1
4303; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4304; GCN-NEXT:    s_lshl_b32 s2, s4, s2
4305; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
4306; GCN-NEXT:    s_lshl_b32 s10, s4, s3
4307; GCN-NEXT:    s_mov_b32 s3, 0x4f800000
4308; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s10
4309; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4310; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4311; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
4312; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
4313; GCN-NEXT:    v_mul_f32_e32 v0, s3, v0
4314; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4315; GCN-NEXT:    v_mul_f32_e32 v1, s3, v1
4316; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4317; GCN-NEXT:    v_mul_lo_u32 v2, v0, s2
4318; GCN-NEXT:    v_mul_hi_u32 v3, v0, s2
4319; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
4320; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
4321; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
4322; GCN-NEXT:    v_mul_hi_u32 v2, v2, v0
4323; GCN-NEXT:    v_mul_lo_u32 v3, v1, s10
4324; GCN-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
4325; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
4326; GCN-NEXT:    v_mul_hi_u32 v2, v1, s10
4327; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
4328; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
4329; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4330; GCN-NEXT:    v_mul_hi_u32 v0, v0, s8
4331; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
4332; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
4333; GCN-NEXT:    v_mul_hi_u32 v2, v2, v1
4334; GCN-NEXT:    v_mul_lo_u32 v5, v0, s2
4335; GCN-NEXT:    v_add_i32_e32 v4, vcc, v2, v1
4336; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v2, v1
4337; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
4338; GCN-NEXT:    v_mul_hi_u32 v1, v1, s9
4339; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v5
4340; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s2, v3
4341; GCN-NEXT:    v_add_i32_e32 v3, vcc, -1, v0
4342; GCN-NEXT:    v_mul_lo_u32 v4, v1, s10
4343; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], s8, v5
4344; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
4345; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
4346; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4347; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s9, v4
4348; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[0:1]
4349; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
4350; GCN-NEXT:    v_add_i32_e32 v2, vcc, -1, v1
4351; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s9, v4
4352; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
4353; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
4354; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
4355; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[2:3]
4356; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4357; GCN-NEXT:    s_endpgm
4358  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
4359  %r = udiv <2 x i32> %x, %shl.y
4360  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4361  ret void
4362}
4363
; Unsigned remainder by the odd constant 1235195. The opt-side assertions
; expect the urem unchanged in IR; the llc-side assertions expect the quotient
; to be formed with a multiply-high by 0xb2a50881 plus fixup shifts, multiplied
; back by 0x12d8fb (1235195) and subtracted from %x.
4364define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
4365; CHECK-LABEL: @urem_i32_oddk_denom(
4366; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 1235195
4367; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
4368; CHECK-NEXT:    ret void
4369;
4370; GCN-LABEL: urem_i32_oddk_denom:
4371; GCN:       ; %bb.0:
4372; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4373; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4374; GCN-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
4375; GCN-NEXT:    s_mov_b32 s7, 0xf000
4376; GCN-NEXT:    s_mov_b32 s6, -1
4377; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4378; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
4379; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
4380; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
4381; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
4382; GCN-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
4383; GCN-NEXT:    v_mul_u32_u24_e32 v0, 0x12d8fb, v0
4384; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
4385; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4386; GCN-NEXT:    s_endpgm
4387  %r = urem i32 %x, 1235195
4388  store i32 %r, i32 addrspace(1)* %out
4389  ret void
4390}
4391
; Unsigned remainder by the power-of-two constant 4096. The opt-side assertions
; expect the urem unchanged in IR; the llc-side assertions expect a single
; s_and_b32 with the mask 0xfff.
4392define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
4393; CHECK-LABEL: @urem_i32_pow2k_denom(
4394; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 4096
4395; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
4396; CHECK-NEXT:    ret void
4397;
4398; GCN-LABEL: urem_i32_pow2k_denom:
4399; GCN:       ; %bb.0:
4400; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4401; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4402; GCN-NEXT:    s_mov_b32 s7, 0xf000
4403; GCN-NEXT:    s_mov_b32 s6, -1
4404; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4405; GCN-NEXT:    s_and_b32 s0, s0, 0xfff
4406; GCN-NEXT:    v_mov_b32_e32 v0, s0
4407; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4408; GCN-NEXT:    s_endpgm
4409  %r = urem i32 %x, 4096
4410  store i32 %r, i32 addrspace(1)* %out
4411  ret void
4412}
4413
; Unsigned remainder whose divisor is 4096 shifted left by the runtime value
; %y. The opt-side assertions expect the shl/urem pair unchanged in IR; the
; llc-side assertions expect the divisor to be materialized with s_lshl_b32,
; decremented by one, and used as an s_and_b32 mask on %x.
4414define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
4415; CHECK-LABEL: @urem_i32_pow2_shl_denom(
4416; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
4417; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]]
4418; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
4419; CHECK-NEXT:    ret void
4420;
4421; GCN-LABEL: urem_i32_pow2_shl_denom:
4422; GCN:       ; %bb.0:
4423; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4424; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4425; GCN-NEXT:    s_mov_b32 s7, 0xf000
4426; GCN-NEXT:    s_mov_b32 s6, -1
4427; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4428; GCN-NEXT:    s_lshl_b32 s1, 0x1000, s1
4429; GCN-NEXT:    s_add_i32 s1, s1, -1
4430; GCN-NEXT:    s_and_b32 s0, s0, s1
4431; GCN-NEXT:    v_mov_b32_e32 v0, s0
4432; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4433; GCN-NEXT:    s_endpgm
4434  %shl.y = shl i32 4096, %y
4435  %r = urem i32 %x, %shl.y
4436  store i32 %r, i32 addrspace(1)* %out
4437  ret void
4438}
4439
; Two-lane unsigned remainder by a splat of 4096. The opt-side assertions
; expect the vector op to be split into one scalar urem per lane and
; reassembled with insertelement; the llc-side assertions expect each lane to
; become an s_and_b32 against a shared 0xfff mask register.
4440define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4441; CHECK-LABEL: @urem_v2i32_pow2k_denom(
4442; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4443; CHECK-NEXT:    [[TMP2:%.*]] = urem i32 [[TMP1]], 4096
4444; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4445; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4446; CHECK-NEXT:    [[TMP5:%.*]] = urem i32 [[TMP4]], 4096
4447; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4448; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]]
4449; CHECK-NEXT:    ret void
4450;
4451; GCN-LABEL: urem_v2i32_pow2k_denom:
4452; GCN:       ; %bb.0:
4453; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4454; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4455; GCN-NEXT:    s_movk_i32 s2, 0xfff
4456; GCN-NEXT:    s_mov_b32 s7, 0xf000
4457; GCN-NEXT:    s_mov_b32 s6, -1
4458; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4459; GCN-NEXT:    s_and_b32 s0, s0, s2
4460; GCN-NEXT:    s_and_b32 s1, s1, s2
4461; GCN-NEXT:    v_mov_b32_e32 v0, s0
4462; GCN-NEXT:    v_mov_b32_e32 v1, s1
4463; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4464; GCN-NEXT:    s_endpgm
4465  %r = urem <2 x i32> %x, <i32 4096, i32 4096>
4466  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4467  ret void
4468}
4469
4470define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
4471; CHECK-LABEL: @urem_v2i32_pow2_shl_denom(
4472; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
4473; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4474; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
4475; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
4476; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
4477; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41F0000000000000
4478; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
4479; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
4480; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP2]] to i64
4481; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
4482; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
4483; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
4484; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
4485; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP10]]
4486; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
4487; CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP10]]
4488; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
4489; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP6]] to i64
4490; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
4491; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
4492; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
4493; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
4494; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP6]], [[TMP21]]
4495; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP6]], [[TMP21]]
4496; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP14]], i32 [[TMP22]], i32 [[TMP23]]
4497; CHECK-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
4498; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[TMP1]] to i64
4499; CHECK-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP25]], [[TMP26]]
4500; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
4501; CHECK-NEXT:    [[TMP29:%.*]] = lshr i64 [[TMP27]], 32
4502; CHECK-NEXT:    [[TMP30:%.*]] = trunc i64 [[TMP29]] to i32
4503; CHECK-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], [[TMP2]]
4504; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP1]], [[TMP31]]
4505; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP2]]
4506; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP1]], [[TMP31]]
4507; CHECK-NEXT:    [[TMP35:%.*]] = and i1 [[TMP33]], [[TMP34]]
4508; CHECK-NEXT:    [[TMP36:%.*]] = sub i32 [[TMP32]], [[TMP2]]
4509; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP2]]
4510; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP35]], i32 [[TMP36]], i32 [[TMP32]]
4511; CHECK-NEXT:    [[TMP39:%.*]] = select i1 [[TMP34]], i32 [[TMP38]], i32 [[TMP37]]
4512; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <2 x i32> undef, i32 [[TMP39]], i64 0
4513; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <2 x i32> [[X]], i64 1
4514; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
4515; CHECK-NEXT:    [[TMP43:%.*]] = uitofp i32 [[TMP42]] to float
4516; CHECK-NEXT:    [[TMP44:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP43]])
4517; CHECK-NEXT:    [[TMP45:%.*]] = fmul fast float [[TMP44]], 0x41F0000000000000
4518; CHECK-NEXT:    [[TMP46:%.*]] = fptoui float [[TMP45]] to i32
4519; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP46]] to i64
4520; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP42]] to i64
4521; CHECK-NEXT:    [[TMP49:%.*]] = mul i64 [[TMP47]], [[TMP48]]
4522; CHECK-NEXT:    [[TMP50:%.*]] = trunc i64 [[TMP49]] to i32
4523; CHECK-NEXT:    [[TMP51:%.*]] = lshr i64 [[TMP49]], 32
4524; CHECK-NEXT:    [[TMP52:%.*]] = trunc i64 [[TMP51]] to i32
4525; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 0, [[TMP50]]
4526; CHECK-NEXT:    [[TMP54:%.*]] = icmp eq i32 [[TMP52]], 0
4527; CHECK-NEXT:    [[TMP55:%.*]] = select i1 [[TMP54]], i32 [[TMP53]], i32 [[TMP50]]
4528; CHECK-NEXT:    [[TMP56:%.*]] = zext i32 [[TMP55]] to i64
4529; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP46]] to i64
4530; CHECK-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP56]], [[TMP57]]
4531; CHECK-NEXT:    [[TMP59:%.*]] = trunc i64 [[TMP58]] to i32
4532; CHECK-NEXT:    [[TMP60:%.*]] = lshr i64 [[TMP58]], 32
4533; CHECK-NEXT:    [[TMP61:%.*]] = trunc i64 [[TMP60]] to i32
4534; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP46]], [[TMP61]]
4535; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP46]], [[TMP61]]
4536; CHECK-NEXT:    [[TMP64:%.*]] = select i1 [[TMP54]], i32 [[TMP62]], i32 [[TMP63]]
4537; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP64]] to i64
4538; CHECK-NEXT:    [[TMP66:%.*]] = zext i32 [[TMP41]] to i64
4539; CHECK-NEXT:    [[TMP67:%.*]] = mul i64 [[TMP65]], [[TMP66]]
4540; CHECK-NEXT:    [[TMP68:%.*]] = trunc i64 [[TMP67]] to i32
4541; CHECK-NEXT:    [[TMP69:%.*]] = lshr i64 [[TMP67]], 32
4542; CHECK-NEXT:    [[TMP70:%.*]] = trunc i64 [[TMP69]] to i32
4543; CHECK-NEXT:    [[TMP71:%.*]] = mul i32 [[TMP70]], [[TMP42]]
4544; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP41]], [[TMP71]]
4545; CHECK-NEXT:    [[TMP73:%.*]] = icmp uge i32 [[TMP72]], [[TMP42]]
4546; CHECK-NEXT:    [[TMP74:%.*]] = icmp uge i32 [[TMP41]], [[TMP71]]
4547; CHECK-NEXT:    [[TMP75:%.*]] = and i1 [[TMP73]], [[TMP74]]
4548; CHECK-NEXT:    [[TMP76:%.*]] = sub i32 [[TMP72]], [[TMP42]]
4549; CHECK-NEXT:    [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP42]]
4550; CHECK-NEXT:    [[TMP78:%.*]] = select i1 [[TMP75]], i32 [[TMP76]], i32 [[TMP72]]
4551; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP74]], i32 [[TMP78]], i32 [[TMP77]]
4552; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <2 x i32> [[TMP40]], i32 [[TMP79]], i64 1
4553; CHECK-NEXT:    store <2 x i32> [[TMP80]], <2 x i32> addrspace(1)* [[OUT:%.*]]
4554; CHECK-NEXT:    ret void
4555;
4556; GCN-LABEL: urem_v2i32_pow2_shl_denom:
4557; GCN:       ; %bb.0:
4558; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
4559; GCN-NEXT:    s_movk_i32 s4, 0x1000
4560; GCN-NEXT:    s_mov_b32 s7, 0xf000
4561; GCN-NEXT:    s_mov_b32 s6, -1
4562; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4563; GCN-NEXT:    s_lshl_b32 s10, s4, s2
4564; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s10
4565; GCN-NEXT:    s_mov_b32 s2, 0x4f800000
4566; GCN-NEXT:    s_lshl_b32 s11, s4, s3
4567; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s11
4568; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4569; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4570; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
4571; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
4572; GCN-NEXT:    v_mul_f32_e32 v0, s2, v0
4573; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4574; GCN-NEXT:    v_mul_f32_e32 v1, s2, v1
4575; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4576; GCN-NEXT:    v_mul_lo_u32 v2, v0, s10
4577; GCN-NEXT:    v_mul_hi_u32 v3, v0, s10
4578; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
4579; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
4580; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
4581; GCN-NEXT:    v_mul_hi_u32 v2, v2, v0
4582; GCN-NEXT:    v_mul_lo_u32 v3, v1, s11
4583; GCN-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
4584; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
4585; GCN-NEXT:    v_mul_hi_u32 v2, v1, s11
4586; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
4587; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
4588; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4589; GCN-NEXT:    v_mul_hi_u32 v0, v0, s8
4590; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
4591; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
4592; GCN-NEXT:    v_mul_hi_u32 v2, v2, v1
4593; GCN-NEXT:    v_mul_lo_u32 v0, v0, s10
4594; GCN-NEXT:    v_add_i32_e32 v5, vcc, v2, v1
4595; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v2, v1
4596; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
4597; GCN-NEXT:    v_mul_hi_u32 v1, v1, s9
4598; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v0
4599; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], s8, v0
4600; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s10, v3
4601; GCN-NEXT:    v_mul_lo_u32 v1, v1, s11
4602; GCN-NEXT:    v_add_i32_e32 v4, vcc, s10, v3
4603; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s10, v3
4604; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
4605; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
4606; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s9, v1
4607; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[0:1]
4608; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s9, v1
4609; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v2
4610; GCN-NEXT:    v_add_i32_e32 v3, vcc, s11, v2
4611; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s11, v2
4612; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
4613; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
4614; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[2:3]
4615; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4616; GCN-NEXT:    s_endpgm
4617  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
4618  %r = urem <2 x i32> %x, %shl.y
4619  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4620  ret void
4621}
4622
; Signed i32 division of %x by the odd constant 1235195, stored to %out.
; The IR assertions expect the opt invocation to leave the sdiv untouched.
; The ISA assertions expect the llc invocation to emit a signed multiply-high
; by the constant 0xd9528441 followed by add/shift fixups, with no
; reciprocal-based expansion.
4623define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
4624; CHECK-LABEL: @sdiv_i32_oddk_denom(
4625; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195
4626; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
4627; CHECK-NEXT:    ret void
4628;
4629; GCN-LABEL: sdiv_i32_oddk_denom:
4630; GCN:       ; %bb.0:
4631; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4632; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4633; GCN-NEXT:    v_mov_b32_e32 v0, 0xd9528441
4634; GCN-NEXT:    s_mov_b32 s7, 0xf000
4635; GCN-NEXT:    s_mov_b32 s6, -1
4636; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4637; GCN-NEXT:    v_mul_hi_i32 v0, s0, v0
4638; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
4639; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
4640; GCN-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
4641; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4642; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4643; GCN-NEXT:    s_endpgm
4644  %r = sdiv i32 %x, 1235195
4645  store i32 %r, i32 addrspace(1)* %out
4646  ret void
4647}
4648
; Signed i32 division of %x by the power-of-two constant 4096, stored to %out.
; The IR assertions expect the opt invocation to leave the sdiv untouched.
; The ISA assertions expect a purely scalar sequence: a sign mask (ashr 31),
; a bias taken from it (lshr 20), an add, and a final arithmetic shift by 12.
4649define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
4650; CHECK-LABEL: @sdiv_i32_pow2k_denom(
4651; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 4096
4652; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
4653; CHECK-NEXT:    ret void
4654;
4655; GCN-LABEL: sdiv_i32_pow2k_denom:
4656; GCN:       ; %bb.0:
4657; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4658; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4659; GCN-NEXT:    s_mov_b32 s7, 0xf000
4660; GCN-NEXT:    s_mov_b32 s6, -1
4661; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4662; GCN-NEXT:    s_ashr_i32 s1, s0, 31
4663; GCN-NEXT:    s_lshr_b32 s1, s1, 20
4664; GCN-NEXT:    s_add_i32 s0, s0, s1
4665; GCN-NEXT:    s_ashr_i32 s0, s0, 12
4666; GCN-NEXT:    v_mov_b32_e32 v0, s0
4667; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4668; GCN-NEXT:    s_endpgm
4669  %r = sdiv i32 %x, 4096
4670  store i32 %r, i32 addrspace(1)* %out
4671  ret void
4672}
4673
; Signed i32 division of %x by a variable denominator computed as 4096 << %y.
; The IR assertions expect the opt invocation to keep the shl and the sdiv
; as-is. The ISA assertions expect the llc invocation to expand the division:
; both operands are made non-negative with sign masks (ashr 31, add, xor), a
; quotient estimate is built from v_rcp_iflag_f32 and then corrected by +1/-1,
; and the result sign is restored with a final xor/subtract.
4674define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
4675; CHECK-LABEL: @sdiv_i32_pow2_shl_denom(
4676; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
4677; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]]
4678; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
4679; CHECK-NEXT:    ret void
4680;
4681; GCN-LABEL: sdiv_i32_pow2_shl_denom:
4682; GCN:       ; %bb.0:
4683; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4684; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4685; GCN-NEXT:    s_mov_b32 s7, 0xf000
4686; GCN-NEXT:    s_mov_b32 s6, -1
4687; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4688; GCN-NEXT:    s_lshl_b32 s3, 0x1000, s3
4689; GCN-NEXT:    s_ashr_i32 s8, s3, 31
4690; GCN-NEXT:    s_add_i32 s3, s3, s8
4691; GCN-NEXT:    s_xor_b32 s9, s3, s8
4692; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
4693; GCN-NEXT:    s_ashr_i32 s3, s2, 31
4694; GCN-NEXT:    s_add_i32 s2, s2, s3
4695; GCN-NEXT:    s_xor_b32 s2, s2, s3
4696; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4697; GCN-NEXT:    s_xor_b32 s3, s3, s8
4698; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
4699; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4700; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
4701; GCN-NEXT:    v_mul_hi_u32 v2, v0, s9
4702; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
4703; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
4704; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
4705; GCN-NEXT:    v_mul_hi_u32 v1, v1, v0
4706; GCN-NEXT:    v_add_i32_e32 v2, vcc, v1, v0
4707; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
4708; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
4709; GCN-NEXT:    v_mul_hi_u32 v0, v0, s2
4710; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
4711; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
4712; GCN-NEXT:    v_add_i32_e32 v3, vcc, -1, v0
4713; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], s2, v1
4714; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
4715; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
4716; GCN-NEXT:    s_and_b64 vcc, vcc, s[0:1]
4717; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4718; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[0:1]
4719; GCN-NEXT:    v_xor_b32_e32 v0, s3, v0
4720; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
4721; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4722; GCN-NEXT:    s_endpgm
4723  %shl.y = shl i32 4096, %y
4724  %r = sdiv i32 %x, %shl.y
4725  store i32 %r, i32 addrspace(1)* %out
4726  ret void
4727}
4728
; Signed <2 x i32> division of %x by the splat constant <4096, 4096>.
; The IR assertions expect the opt invocation to scalarize the vector sdiv
; into one scalar "sdiv ..., 4096" per lane (extractelement / insertelement).
; The ISA assertions expect each lane to use the scalar sign-bias-and-shift
; sequence (ashr 31, lshr 20, add, ashr 12) before a single 64-bit store.
4729define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4730; CHECK-LABEL: @sdiv_v2i32_pow2k_denom(
4731; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4732; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
4733; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4734; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4735; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096
4736; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4737; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]]
4738; CHECK-NEXT:    ret void
4739;
4740; GCN-LABEL: sdiv_v2i32_pow2k_denom:
4741; GCN:       ; %bb.0:
4742; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4743; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4744; GCN-NEXT:    s_mov_b32 s7, 0xf000
4745; GCN-NEXT:    s_mov_b32 s6, -1
4746; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4747; GCN-NEXT:    s_ashr_i32 s2, s0, 31
4748; GCN-NEXT:    s_lshr_b32 s2, s2, 20
4749; GCN-NEXT:    s_ashr_i32 s3, s1, 31
4750; GCN-NEXT:    s_add_i32 s0, s0, s2
4751; GCN-NEXT:    s_lshr_b32 s2, s3, 20
4752; GCN-NEXT:    s_add_i32 s1, s1, s2
4753; GCN-NEXT:    s_ashr_i32 s0, s0, 12
4754; GCN-NEXT:    s_ashr_i32 s1, s1, 12
4755; GCN-NEXT:    v_mov_b32_e32 v0, s0
4756; GCN-NEXT:    v_mov_b32_e32 v1, s1
4757; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4758; GCN-NEXT:    s_endpgm
4759  %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
4760  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4761  ret void
4762}
4763
; Signed <2 x i32> division of %x by the mixed constant vector <4096, 4095>:
; lane 0 has a power-of-two divisor, lane 1 does not.
; The IR assertions expect the opt invocation to scalarize into
; "sdiv ..., 4096" for lane 0 and "sdiv ..., 4095" for lane 1.
; The ISA assertions expect lane 0 to use the scalar shift sequence and lane 1
; to use a signed multiply-high by 0x80080081 with add/shift fixups.
; NOTE(review): the "ssdiv" spelling differs from the sibling "sdiv_*" tests
; and looks like a typo; it is left unchanged because the kernel name is
; referenced by both label assertions below.
4764define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4765; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom(
4766; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4767; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
4768; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4769; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4770; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095
4771; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4772; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]]
4773; CHECK-NEXT:    ret void
4774;
4775; GCN-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
4776; GCN:       ; %bb.0:
4777; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4778; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4779; GCN-NEXT:    v_mov_b32_e32 v0, 0x80080081
4780; GCN-NEXT:    s_mov_b32 s7, 0xf000
4781; GCN-NEXT:    s_mov_b32 s6, -1
4782; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4783; GCN-NEXT:    v_mul_hi_i32 v0, s1, v0
4784; GCN-NEXT:    s_ashr_i32 s2, s0, 31
4785; GCN-NEXT:    s_lshr_b32 s2, s2, 20
4786; GCN-NEXT:    s_add_i32 s0, s0, s2
4787; GCN-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
4788; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
4789; GCN-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
4790; GCN-NEXT:    s_ashr_i32 s0, s0, 12
4791; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
4792; GCN-NEXT:    v_mov_b32_e32 v0, s0
4793; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4794; GCN-NEXT:    s_endpgm
4795  %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
4796  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4797  ret void
4798}
4799
; Signed <2 x i32> division of %x by a variable per-lane denominator computed
; as <4096, 4096> << %y.
; Unlike the scalar variant of this test, the IR assertions here expect the
; opt invocation to expand the division fully, once per lane:
;   1. sign masks for numerator and denominator (ashr 31) and their xor,
;   2. absolute values via add + xor with the sign mask,
;   3. a reciprocal estimate (uitofp, llvm.amdgcn.rcp.f32, fmul, fptoui)
;      refined with 64-bit multiplies split into low/high halves,
;   4. a quotient estimate corrected by +1 / -1 based on two unsigned compares,
;   5. sign restoration with xor + sub using the combined sign mask.
; The two lane results are re-packed with insertelement and stored.
; The ISA assertions pin the interleaved two-lane machine code from llc.
4800define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
4801; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom(
4802; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
4803; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4804; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
4805; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
4806; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
4807; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4808; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
4809; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
4810; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
4811; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
4812; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
4813; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
4814; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41F0000000000000
4815; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
4816; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
4817; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP9]] to i64
4818; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
4819; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
4820; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
4821; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
4822; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 0, [[TMP17]]
4823; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
4824; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 [[TMP17]]
4825; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP22]] to i64
4826; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP13]] to i64
4827; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
4828; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
4829; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
4830; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
4831; CHECK-NEXT:    [[TMP29:%.*]] = add i32 [[TMP13]], [[TMP28]]
4832; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP13]], [[TMP28]]
4833; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP21]], i32 [[TMP29]], i32 [[TMP30]]
4834; CHECK-NEXT:    [[TMP32:%.*]] = zext i32 [[TMP31]] to i64
4835; CHECK-NEXT:    [[TMP33:%.*]] = zext i32 [[TMP8]] to i64
4836; CHECK-NEXT:    [[TMP34:%.*]] = mul i64 [[TMP32]], [[TMP33]]
4837; CHECK-NEXT:    [[TMP35:%.*]] = trunc i64 [[TMP34]] to i32
4838; CHECK-NEXT:    [[TMP36:%.*]] = lshr i64 [[TMP34]], 32
4839; CHECK-NEXT:    [[TMP37:%.*]] = trunc i64 [[TMP36]] to i32
4840; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP9]]
4841; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 [[TMP8]], [[TMP38]]
4842; CHECK-NEXT:    [[TMP40:%.*]] = icmp uge i32 [[TMP39]], [[TMP9]]
4843; CHECK-NEXT:    [[TMP41:%.*]] = icmp uge i32 [[TMP8]], [[TMP38]]
4844; CHECK-NEXT:    [[TMP42:%.*]] = and i1 [[TMP40]], [[TMP41]]
4845; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP37]], 1
4846; CHECK-NEXT:    [[TMP44:%.*]] = sub i32 [[TMP37]], 1
4847; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP42]], i32 [[TMP43]], i32 [[TMP37]]
4848; CHECK-NEXT:    [[TMP46:%.*]] = select i1 [[TMP41]], i32 [[TMP45]], i32 [[TMP44]]
4849; CHECK-NEXT:    [[TMP47:%.*]] = xor i32 [[TMP46]], [[TMP5]]
4850; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP47]], [[TMP5]]
4851; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <2 x i32> undef, i32 [[TMP48]], i64 0
4852; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <2 x i32> [[X]], i64 1
4853; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
4854; CHECK-NEXT:    [[TMP52:%.*]] = ashr i32 [[TMP50]], 31
4855; CHECK-NEXT:    [[TMP53:%.*]] = ashr i32 [[TMP51]], 31
4856; CHECK-NEXT:    [[TMP54:%.*]] = xor i32 [[TMP52]], [[TMP53]]
4857; CHECK-NEXT:    [[TMP55:%.*]] = add i32 [[TMP50]], [[TMP52]]
4858; CHECK-NEXT:    [[TMP56:%.*]] = add i32 [[TMP51]], [[TMP53]]
4859; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP52]]
4860; CHECK-NEXT:    [[TMP58:%.*]] = xor i32 [[TMP56]], [[TMP53]]
4861; CHECK-NEXT:    [[TMP59:%.*]] = uitofp i32 [[TMP58]] to float
4862; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP59]])
4863; CHECK-NEXT:    [[TMP61:%.*]] = fmul fast float [[TMP60]], 0x41F0000000000000
4864; CHECK-NEXT:    [[TMP62:%.*]] = fptoui float [[TMP61]] to i32
4865; CHECK-NEXT:    [[TMP63:%.*]] = zext i32 [[TMP62]] to i64
4866; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP58]] to i64
4867; CHECK-NEXT:    [[TMP65:%.*]] = mul i64 [[TMP63]], [[TMP64]]
4868; CHECK-NEXT:    [[TMP66:%.*]] = trunc i64 [[TMP65]] to i32
4869; CHECK-NEXT:    [[TMP67:%.*]] = lshr i64 [[TMP65]], 32
4870; CHECK-NEXT:    [[TMP68:%.*]] = trunc i64 [[TMP67]] to i32
4871; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 0, [[TMP66]]
4872; CHECK-NEXT:    [[TMP70:%.*]] = icmp eq i32 [[TMP68]], 0
4873; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP69]], i32 [[TMP66]]
4874; CHECK-NEXT:    [[TMP72:%.*]] = zext i32 [[TMP71]] to i64
4875; CHECK-NEXT:    [[TMP73:%.*]] = zext i32 [[TMP62]] to i64
4876; CHECK-NEXT:    [[TMP74:%.*]] = mul i64 [[TMP72]], [[TMP73]]
4877; CHECK-NEXT:    [[TMP75:%.*]] = trunc i64 [[TMP74]] to i32
4878; CHECK-NEXT:    [[TMP76:%.*]] = lshr i64 [[TMP74]], 32
4879; CHECK-NEXT:    [[TMP77:%.*]] = trunc i64 [[TMP76]] to i32
4880; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP62]], [[TMP77]]
4881; CHECK-NEXT:    [[TMP79:%.*]] = sub i32 [[TMP62]], [[TMP77]]
4882; CHECK-NEXT:    [[TMP80:%.*]] = select i1 [[TMP70]], i32 [[TMP78]], i32 [[TMP79]]
4883; CHECK-NEXT:    [[TMP81:%.*]] = zext i32 [[TMP80]] to i64
4884; CHECK-NEXT:    [[TMP82:%.*]] = zext i32 [[TMP57]] to i64
4885; CHECK-NEXT:    [[TMP83:%.*]] = mul i64 [[TMP81]], [[TMP82]]
4886; CHECK-NEXT:    [[TMP84:%.*]] = trunc i64 [[TMP83]] to i32
4887; CHECK-NEXT:    [[TMP85:%.*]] = lshr i64 [[TMP83]], 32
4888; CHECK-NEXT:    [[TMP86:%.*]] = trunc i64 [[TMP85]] to i32
4889; CHECK-NEXT:    [[TMP87:%.*]] = mul i32 [[TMP86]], [[TMP58]]
4890; CHECK-NEXT:    [[TMP88:%.*]] = sub i32 [[TMP57]], [[TMP87]]
4891; CHECK-NEXT:    [[TMP89:%.*]] = icmp uge i32 [[TMP88]], [[TMP58]]
4892; CHECK-NEXT:    [[TMP90:%.*]] = icmp uge i32 [[TMP57]], [[TMP87]]
4893; CHECK-NEXT:    [[TMP91:%.*]] = and i1 [[TMP89]], [[TMP90]]
4894; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP86]], 1
4895; CHECK-NEXT:    [[TMP93:%.*]] = sub i32 [[TMP86]], 1
4896; CHECK-NEXT:    [[TMP94:%.*]] = select i1 [[TMP91]], i32 [[TMP92]], i32 [[TMP86]]
4897; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP90]], i32 [[TMP94]], i32 [[TMP93]]
4898; CHECK-NEXT:    [[TMP96:%.*]] = xor i32 [[TMP95]], [[TMP54]]
4899; CHECK-NEXT:    [[TMP97:%.*]] = sub i32 [[TMP96]], [[TMP54]]
4900; CHECK-NEXT:    [[TMP98:%.*]] = insertelement <2 x i32> [[TMP49]], i32 [[TMP97]], i64 1
4901; CHECK-NEXT:    store <2 x i32> [[TMP98]], <2 x i32> addrspace(1)* [[OUT:%.*]]
4902; CHECK-NEXT:    ret void
4903;
4904; GCN-LABEL: sdiv_v2i32_pow2_shl_denom:
4905; GCN:       ; %bb.0:
4906; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
4907; GCN-NEXT:    s_movk_i32 s4, 0x1000
4908; GCN-NEXT:    s_mov_b32 s14, 0x4f800000
4909; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
4910; GCN-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xb
4911; GCN-NEXT:    s_mov_b32 s11, 0xf000
4912; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4913; GCN-NEXT:    s_lshl_b32 s2, s4, s2
4914; GCN-NEXT:    s_ashr_i32 s5, s2, 31
4915; GCN-NEXT:    s_add_i32 s2, s2, s5
4916; GCN-NEXT:    s_xor_b32 s13, s2, s5
4917; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s13
4918; GCN-NEXT:    s_ashr_i32 s2, s6, 31
4919; GCN-NEXT:    s_lshl_b32 s0, s4, s3
4920; GCN-NEXT:    s_add_i32 s1, s6, s2
4921; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4922; GCN-NEXT:    s_ashr_i32 s6, s0, 31
4923; GCN-NEXT:    s_add_i32 s4, s0, s6
4924; GCN-NEXT:    s_xor_b32 s3, s1, s2
4925; GCN-NEXT:    v_mul_f32_e32 v0, s14, v0
4926; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4927; GCN-NEXT:    s_xor_b32 s15, s4, s6
4928; GCN-NEXT:    s_xor_b32 s12, s2, s5
4929; GCN-NEXT:    s_mov_b32 s10, -1
4930; GCN-NEXT:    v_mul_lo_u32 v1, v0, s13
4931; GCN-NEXT:    v_mul_hi_u32 v2, v0, s13
4932; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
4933; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
4934; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
4935; GCN-NEXT:    v_mul_hi_u32 v1, v1, v0
4936; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s15
4937; GCN-NEXT:    v_add_i32_e32 v3, vcc, v1, v0
4938; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
4939; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
4940; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v2
4941; GCN-NEXT:    v_mul_hi_u32 v0, v0, s3
4942; GCN-NEXT:    v_mul_f32_e32 v1, s14, v1
4943; GCN-NEXT:    v_mul_lo_u32 v2, v0, s13
4944; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4945; GCN-NEXT:    v_add_i32_e32 v3, vcc, -1, v0
4946; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
4947; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v4
4948; GCN-NEXT:    v_mul_lo_u32 v4, v1, s15
4949; GCN-NEXT:    v_mul_hi_u32 v5, v1, s15
4950; GCN-NEXT:    s_ashr_i32 s13, s7, 31
4951; GCN-NEXT:    s_add_i32 s7, s7, s13
4952; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v4
4953; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
4954; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[4:5]
4955; GCN-NEXT:    v_mul_hi_u32 v4, v4, v1
4956; GCN-NEXT:    s_xor_b32 s7, s7, s13
4957; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s3, v2
4958; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
4959; GCN-NEXT:    v_add_i32_e32 v5, vcc, v4, v1
4960; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v4, v1
4961; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
4962; GCN-NEXT:    v_mul_hi_u32 v1, v1, s7
4963; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
4964; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4965; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[2:3]
4966; GCN-NEXT:    v_mul_lo_u32 v2, v1, s15
4967; GCN-NEXT:    v_xor_b32_e32 v0, s12, v0
4968; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
4969; GCN-NEXT:    s_xor_b32 s4, s13, s6
4970; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s7, v2
4971; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v3
4972; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s7, v2
4973; GCN-NEXT:    v_add_i32_e32 v3, vcc, -1, v1
4974; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
4975; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
4976; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4977; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[2:3]
4978; GCN-NEXT:    v_xor_b32_e32 v1, s4, v1
4979; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v1
4980; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
4981; GCN-NEXT:    s_endpgm
4982  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
4983  %r = sdiv <2 x i32> %x, %shl.y
4984  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4985  ret void
4986}
4987
; Signed i32 remainder of %x by the odd constant 1235195, stored to %out.
; The IR assertions expect the opt invocation to leave the srem untouched.
; The ISA assertions expect the same multiply-high quotient sequence as the
; matching sdiv test (magic constant 0xd9528441), then a multiply of that
; quotient by 0x12d8fb (= 1235195) and a subtraction from the numerator.
4988define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
4989; CHECK-LABEL: @srem_i32_oddk_denom(
4990; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 1235195
4991; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
4992; CHECK-NEXT:    ret void
4993;
4994; GCN-LABEL: srem_i32_oddk_denom:
4995; GCN:       ; %bb.0:
4996; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4997; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4998; GCN-NEXT:    v_mov_b32_e32 v0, 0xd9528441
4999; GCN-NEXT:    s_mov_b32 s7, 0xf000
5000; GCN-NEXT:    s_mov_b32 s6, -1
5001; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5002; GCN-NEXT:    v_mul_hi_i32 v0, s0, v0
5003; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
5004; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
5005; GCN-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
5006; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
5007; GCN-NEXT:    v_mul_i32_i24_e32 v0, 0x12d8fb, v0
5008; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
5009; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5010; GCN-NEXT:    s_endpgm
5011  %r = srem i32 %x, 1235195
5012  store i32 %r, i32 addrspace(1)* %out
5013  ret void
5014}
5015
; Signed i32 remainder of %x by the power-of-two constant 4096, stored to %out.
; The IR assertions expect the opt invocation to leave the srem untouched.
; The ISA assertions expect a purely scalar sequence: a sign-derived bias
; (ashr 31, lshr 20) is added to the numerator, the sum is masked with
; 0xfffff000, and that value is subtracted from the original numerator.
5016define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
5017; CHECK-LABEL: @srem_i32_pow2k_denom(
5018; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 4096
5019; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
5020; CHECK-NEXT:    ret void
5021;
5022; GCN-LABEL: srem_i32_pow2k_denom:
5023; GCN:       ; %bb.0:
5024; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5025; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
5026; GCN-NEXT:    s_mov_b32 s7, 0xf000
5027; GCN-NEXT:    s_mov_b32 s6, -1
5028; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5029; GCN-NEXT:    s_ashr_i32 s1, s0, 31
5030; GCN-NEXT:    s_lshr_b32 s1, s1, 20
5031; GCN-NEXT:    s_add_i32 s1, s0, s1
5032; GCN-NEXT:    s_and_b32 s1, s1, 0xfffff000
5033; GCN-NEXT:    s_sub_i32 s0, s0, s1
5034; GCN-NEXT:    v_mov_b32_e32 v0, s0
5035; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5036; GCN-NEXT:    s_endpgm
5037  %r = srem i32 %x, 4096
5038  store i32 %r, i32 addrspace(1)* %out
5039  ret void
5040}
5041
; Signed i32 remainder of %x by a variable denominator computed as 4096 << %y.
; The IR assertions expect the opt invocation to keep the shl and the srem
; as-is. The ISA assertions expect the llc invocation to expand the remainder:
; both operands are made non-negative with sign masks (ashr 31, add, xor), a
; quotient estimate is built from v_rcp_iflag_f32, the remainder is corrected
; by adding or subtracting the denominator, and the final xor/subtract applies
; the sign mask derived from the numerator.
5042define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
5043; CHECK-LABEL: @srem_i32_pow2_shl_denom(
5044; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
5045; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]]
5046; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
5047; CHECK-NEXT:    ret void
5048;
5049; GCN-LABEL: srem_i32_pow2_shl_denom:
5050; GCN:       ; %bb.0:
5051; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
5052; GCN-NEXT:    s_mov_b32 s7, 0xf000
5053; GCN-NEXT:    s_mov_b32 s6, -1
5054; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5055; GCN-NEXT:    s_lshl_b32 s2, 0x1000, s5
5056; GCN-NEXT:    s_ashr_i32 s3, s2, 31
5057; GCN-NEXT:    s_add_i32 s2, s2, s3
5058; GCN-NEXT:    s_xor_b32 s10, s2, s3
5059; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s10
5060; GCN-NEXT:    s_ashr_i32 s8, s4, 31
5061; GCN-NEXT:    s_add_i32 s4, s4, s8
5062; GCN-NEXT:    s_xor_b32 s9, s4, s8
5063; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
5064; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5065; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
5066; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5067; GCN-NEXT:    v_mul_lo_u32 v1, v0, s10
5068; GCN-NEXT:    v_mul_hi_u32 v2, v0, s10
5069; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
5070; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v2
5071; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[2:3]
5072; GCN-NEXT:    v_mul_hi_u32 v1, v1, v0
5073; GCN-NEXT:    v_add_i32_e32 v2, vcc, v1, v0
5074; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
5075; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
5076; GCN-NEXT:    v_mul_hi_u32 v0, v0, s9
5077; GCN-NEXT:    v_mul_lo_u32 v0, v0, s10
5078; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s9, v0
5079; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], s9, v0
5080; GCN-NEXT:    v_add_i32_e32 v2, vcc, s10, v1
5081; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s10, v1
5082; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s10, v1
5083; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
5084; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5085; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
5086; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
5087; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
5088; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5089; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5090; GCN-NEXT:    s_endpgm
5091  %shl.y = shl i32 4096, %y
5092  %r = srem i32 %x, %shl.y
5093  store i32 %r, i32 addrspace(1)* %out
5094  ret void
5095}
5096
; Signed <2 x i32> remainder of %x by the splat constant <4096, 4096>.
; The IR assertions expect the opt invocation to scalarize the vector srem
; into one scalar "srem ..., 4096" per lane (extractelement / insertelement).
; The ISA assertions expect each lane to use the scalar bias / mask / subtract
; sequence, with both lanes sharing one mask register (s2) that is
; materialized once by s_movk_i32.
5097define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
5098; CHECK-LABEL: @srem_v2i32_pow2k_denom(
5099; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5100; CHECK-NEXT:    [[TMP2:%.*]] = srem i32 [[TMP1]], 4096
5101; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
5102; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5103; CHECK-NEXT:    [[TMP5:%.*]] = srem i32 [[TMP4]], 4096
5104; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5105; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]]
5106; CHECK-NEXT:    ret void
5107;
5108; GCN-LABEL: srem_v2i32_pow2k_denom:
5109; GCN:       ; %bb.0:
5110; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5111; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5112; GCN-NEXT:    s_movk_i32 s2, 0xf000
5113; GCN-NEXT:    s_mov_b32 s7, 0xf000
5114; GCN-NEXT:    s_mov_b32 s6, -1
5115; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5116; GCN-NEXT:    s_ashr_i32 s3, s0, 31
5117; GCN-NEXT:    s_lshr_b32 s3, s3, 20
5118; GCN-NEXT:    s_add_i32 s3, s0, s3
5119; GCN-NEXT:    s_and_b32 s3, s3, s2
5120; GCN-NEXT:    s_sub_i32 s0, s0, s3
5121; GCN-NEXT:    s_ashr_i32 s3, s1, 31
5122; GCN-NEXT:    s_lshr_b32 s3, s3, 20
5123; GCN-NEXT:    s_add_i32 s3, s1, s3
5124; GCN-NEXT:    s_and_b32 s2, s3, s2
5125; GCN-NEXT:    s_sub_i32 s1, s1, s2
5126; GCN-NEXT:    v_mov_b32_e32 v0, s0
5127; GCN-NEXT:    v_mov_b32_e32 v1, s1
5128; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5129; GCN-NEXT:    s_endpgm
5130  %r = srem <2 x i32> %x, <i32 4096, i32 4096>
5131  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5132  ret void
5133}
5134
; Signed remainder of a <2 x i32> vector where each lane's denominator is
; 4096 shifted left by the matching lane of %y. The opt assertions show the
; srem split per lane (extractelement / insertelement) and each lane expanded
; into a sequence built around uitofp, llvm.amdgcn.rcp.f32 and select-based
; fix-ups, with the sign of the numerator re-applied at the end. The llc
; assertions pin the resulting instruction sequence for both lanes.
5135define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
5136; CHECK-LABEL: @srem_v2i32_pow2_shl_denom(
5137; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
5138; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5139; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
5140; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
5141; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
5142; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
5143; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
5144; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
5145; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
5146; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
5147; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
5148; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41F0000000000000
5149; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
5150; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
5151; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP8]] to i64
5152; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
5153; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
5154; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
5155; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
5156; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 0, [[TMP16]]
5157; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP18]], 0
5158; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP19]], i32 [[TMP16]]
5159; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP21]] to i64
5160; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP12]] to i64
5161; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
5162; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
5163; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
5164; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
5165; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP12]], [[TMP27]]
5166; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP12]], [[TMP27]]
5167; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP20]], i32 [[TMP28]], i32 [[TMP29]]
5168; CHECK-NEXT:    [[TMP31:%.*]] = zext i32 [[TMP30]] to i64
5169; CHECK-NEXT:    [[TMP32:%.*]] = zext i32 [[TMP7]] to i64
5170; CHECK-NEXT:    [[TMP33:%.*]] = mul i64 [[TMP31]], [[TMP32]]
5171; CHECK-NEXT:    [[TMP34:%.*]] = trunc i64 [[TMP33]] to i32
5172; CHECK-NEXT:    [[TMP35:%.*]] = lshr i64 [[TMP33]], 32
5173; CHECK-NEXT:    [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
5174; CHECK-NEXT:    [[TMP37:%.*]] = mul i32 [[TMP36]], [[TMP8]]
5175; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP7]], [[TMP37]]
5176; CHECK-NEXT:    [[TMP39:%.*]] = icmp uge i32 [[TMP38]], [[TMP8]]
5177; CHECK-NEXT:    [[TMP40:%.*]] = icmp uge i32 [[TMP7]], [[TMP37]]
5178; CHECK-NEXT:    [[TMP41:%.*]] = and i1 [[TMP39]], [[TMP40]]
5179; CHECK-NEXT:    [[TMP42:%.*]] = sub i32 [[TMP38]], [[TMP8]]
5180; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP38]], [[TMP8]]
5181; CHECK-NEXT:    [[TMP44:%.*]] = select i1 [[TMP41]], i32 [[TMP42]], i32 [[TMP38]]
5182; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP40]], i32 [[TMP44]], i32 [[TMP43]]
5183; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP45]], [[TMP3]]
5184; CHECK-NEXT:    [[TMP47:%.*]] = sub i32 [[TMP46]], [[TMP3]]
5185; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <2 x i32> undef, i32 [[TMP47]], i64 0
5186; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <2 x i32> [[X]], i64 1
5187; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
5188; CHECK-NEXT:    [[TMP51:%.*]] = ashr i32 [[TMP49]], 31
5189; CHECK-NEXT:    [[TMP52:%.*]] = ashr i32 [[TMP50]], 31
5190; CHECK-NEXT:    [[TMP53:%.*]] = add i32 [[TMP49]], [[TMP51]]
5191; CHECK-NEXT:    [[TMP54:%.*]] = add i32 [[TMP50]], [[TMP52]]
5192; CHECK-NEXT:    [[TMP55:%.*]] = xor i32 [[TMP53]], [[TMP51]]
5193; CHECK-NEXT:    [[TMP56:%.*]] = xor i32 [[TMP54]], [[TMP52]]
5194; CHECK-NEXT:    [[TMP57:%.*]] = uitofp i32 [[TMP56]] to float
5195; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
5196; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP58]], 0x41F0000000000000
5197; CHECK-NEXT:    [[TMP60:%.*]] = fptoui float [[TMP59]] to i32
5198; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP60]] to i64
5199; CHECK-NEXT:    [[TMP62:%.*]] = zext i32 [[TMP56]] to i64
5200; CHECK-NEXT:    [[TMP63:%.*]] = mul i64 [[TMP61]], [[TMP62]]
5201; CHECK-NEXT:    [[TMP64:%.*]] = trunc i64 [[TMP63]] to i32
5202; CHECK-NEXT:    [[TMP65:%.*]] = lshr i64 [[TMP63]], 32
5203; CHECK-NEXT:    [[TMP66:%.*]] = trunc i64 [[TMP65]] to i32
5204; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 0, [[TMP64]]
5205; CHECK-NEXT:    [[TMP68:%.*]] = icmp eq i32 [[TMP66]], 0
5206; CHECK-NEXT:    [[TMP69:%.*]] = select i1 [[TMP68]], i32 [[TMP67]], i32 [[TMP64]]
5207; CHECK-NEXT:    [[TMP70:%.*]] = zext i32 [[TMP69]] to i64
5208; CHECK-NEXT:    [[TMP71:%.*]] = zext i32 [[TMP60]] to i64
5209; CHECK-NEXT:    [[TMP72:%.*]] = mul i64 [[TMP70]], [[TMP71]]
5210; CHECK-NEXT:    [[TMP73:%.*]] = trunc i64 [[TMP72]] to i32
5211; CHECK-NEXT:    [[TMP74:%.*]] = lshr i64 [[TMP72]], 32
5212; CHECK-NEXT:    [[TMP75:%.*]] = trunc i64 [[TMP74]] to i32
5213; CHECK-NEXT:    [[TMP76:%.*]] = add i32 [[TMP60]], [[TMP75]]
5214; CHECK-NEXT:    [[TMP77:%.*]] = sub i32 [[TMP60]], [[TMP75]]
5215; CHECK-NEXT:    [[TMP78:%.*]] = select i1 [[TMP68]], i32 [[TMP76]], i32 [[TMP77]]
5216; CHECK-NEXT:    [[TMP79:%.*]] = zext i32 [[TMP78]] to i64
5217; CHECK-NEXT:    [[TMP80:%.*]] = zext i32 [[TMP55]] to i64
5218; CHECK-NEXT:    [[TMP81:%.*]] = mul i64 [[TMP79]], [[TMP80]]
5219; CHECK-NEXT:    [[TMP82:%.*]] = trunc i64 [[TMP81]] to i32
5220; CHECK-NEXT:    [[TMP83:%.*]] = lshr i64 [[TMP81]], 32
5221; CHECK-NEXT:    [[TMP84:%.*]] = trunc i64 [[TMP83]] to i32
5222; CHECK-NEXT:    [[TMP85:%.*]] = mul i32 [[TMP84]], [[TMP56]]
5223; CHECK-NEXT:    [[TMP86:%.*]] = sub i32 [[TMP55]], [[TMP85]]
5224; CHECK-NEXT:    [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP56]]
5225; CHECK-NEXT:    [[TMP88:%.*]] = icmp uge i32 [[TMP55]], [[TMP85]]
5226; CHECK-NEXT:    [[TMP89:%.*]] = and i1 [[TMP87]], [[TMP88]]
5227; CHECK-NEXT:    [[TMP90:%.*]] = sub i32 [[TMP86]], [[TMP56]]
5228; CHECK-NEXT:    [[TMP91:%.*]] = add i32 [[TMP86]], [[TMP56]]
5229; CHECK-NEXT:    [[TMP92:%.*]] = select i1 [[TMP89]], i32 [[TMP90]], i32 [[TMP86]]
5230; CHECK-NEXT:    [[TMP93:%.*]] = select i1 [[TMP88]], i32 [[TMP92]], i32 [[TMP91]]
5231; CHECK-NEXT:    [[TMP94:%.*]] = xor i32 [[TMP93]], [[TMP51]]
5232; CHECK-NEXT:    [[TMP95:%.*]] = sub i32 [[TMP94]], [[TMP51]]
5233; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <2 x i32> [[TMP48]], i32 [[TMP95]], i64 1
5234; CHECK-NEXT:    store <2 x i32> [[TMP96]], <2 x i32> addrspace(1)* [[OUT:%.*]]
5235; CHECK-NEXT:    ret void
5236;
5237; GCN-LABEL: srem_v2i32_pow2_shl_denom:
5238; GCN:       ; %bb.0:
5239; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
5240; GCN-NEXT:    s_movk_i32 s4, 0x1000
5241; GCN-NEXT:    s_mov_b32 s14, 0x4f800000
5242; GCN-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xb
5243; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
5244; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5245; GCN-NEXT:    s_lshl_b32 s2, s4, s2
5246; GCN-NEXT:    s_ashr_i32 s5, s2, 31
5247; GCN-NEXT:    s_add_i32 s2, s2, s5
5248; GCN-NEXT:    s_xor_b32 s13, s2, s5
5249; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s13
5250; GCN-NEXT:    s_lshl_b32 s2, s4, s3
5251; GCN-NEXT:    s_ashr_i32 s12, s6, 31
5252; GCN-NEXT:    s_add_i32 s3, s6, s12
5253; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
5254; GCN-NEXT:    s_ashr_i32 s4, s2, 31
5255; GCN-NEXT:    s_add_i32 s6, s2, s4
5256; GCN-NEXT:    s_xor_b32 s5, s3, s12
5257; GCN-NEXT:    v_mul_f32_e32 v0, s14, v0
5258; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5259; GCN-NEXT:    s_xor_b32 s15, s6, s4
5260; GCN-NEXT:    s_ashr_i32 s6, s7, 31
5261; GCN-NEXT:    s_add_i32 s7, s7, s6
5262; GCN-NEXT:    v_mul_lo_u32 v1, v0, s13
5263; GCN-NEXT:    v_mul_hi_u32 v2, v0, s13
5264; GCN-NEXT:    s_xor_b32 s7, s7, s6
5265; GCN-NEXT:    s_mov_b32 s11, 0xf000
5266; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
5267; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v2
5268; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[2:3]
5269; GCN-NEXT:    v_mul_hi_u32 v1, v1, v0
5270; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s15
5271; GCN-NEXT:    s_mov_b32 s10, -1
5272; GCN-NEXT:    v_add_i32_e32 v3, vcc, v1, v0
5273; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
5274; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v2
5275; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[2:3]
5276; GCN-NEXT:    v_mul_hi_u32 v0, v0, s5
5277; GCN-NEXT:    v_mul_f32_e32 v1, s14, v1
5278; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5279; GCN-NEXT:    v_mul_lo_u32 v0, v0, s13
5280; GCN-NEXT:    v_mul_lo_u32 v4, v1, s15
5281; GCN-NEXT:    v_mul_hi_u32 v5, v1, s15
5282; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s5, v0
5283; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s5, v0
5284; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v4
5285; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
5286; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[4:5]
5287; GCN-NEXT:    v_mul_hi_u32 v4, v4, v1
5288; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v2
5289; GCN-NEXT:    v_add_i32_e32 v3, vcc, s13, v2
5290; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s13, v2
5291; GCN-NEXT:    v_add_i32_e32 v5, vcc, v4, v1
5292; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v4, v1
5293; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
5294; GCN-NEXT:    v_mul_hi_u32 v1, v1, s7
5295; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
5296; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
5297; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[2:3]
5298; GCN-NEXT:    v_mul_lo_u32 v1, v1, s15
5299; GCN-NEXT:    v_xor_b32_e32 v0, s12, v0
5300; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
5301; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
5302; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s7, v1
5303; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v2
5304; GCN-NEXT:    v_add_i32_e32 v3, vcc, s15, v2
5305; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s15, v2
5306; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
5307; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5308; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[2:3]
5309; GCN-NEXT:    v_xor_b32_e32 v1, s6, v1
5310; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v1
5311; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
5312; GCN-NEXT:    s_endpgm
5313  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
5314  %r = srem <2 x i32> %x, %shl.y
5315  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5316  ret void
5317}
5318
; Unsigned 64-bit division of %x by the odd constant 1235195949943, stored to
; %out. The opt assertions show the i64 udiv left in place as a single
; instruction; the llc assertions pin the long multi-instruction expansion
; (float reciprocal seed followed by 32-bit multiply / add-with-carry steps
; and compare/select fix-ups) emitted for it.
5319define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
5320; CHECK-LABEL: @udiv_i64_oddk_denom(
5321; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943
5322; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
5323; CHECK-NEXT:    ret void
5324;
5325; GCN-LABEL: udiv_i64_oddk_denom:
5326; GCN:       ; %bb.0:
5327; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
5328; GCN-NEXT:    v_mov_b32_e32 v1, 0x4f800000
5329; GCN-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
5330; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5331; GCN-NEXT:    s_movk_i32 s2, 0xfee0
5332; GCN-NEXT:    s_mov_b32 s3, 0x68958c89
5333; GCN-NEXT:    v_mov_b32_e32 v8, 0
5334; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5335; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5336; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5337; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5338; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5339; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5340; GCN-NEXT:    v_mov_b32_e32 v7, 0
5341; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5342; GCN-NEXT:    v_mul_lo_u32 v2, v0, s2
5343; GCN-NEXT:    v_mul_hi_u32 v3, v0, s3
5344; GCN-NEXT:    v_mul_lo_u32 v4, v1, s3
5345; GCN-NEXT:    s_mov_b32 s11, 0xf000
5346; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5347; GCN-NEXT:    s_mov_b32 s8, s4
5348; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5349; GCN-NEXT:    v_mul_lo_u32 v3, v0, s3
5350; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
5351; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
5352; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
5353; GCN-NEXT:    v_mul_hi_u32 v6, v0, v3
5354; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
5355; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
5356; GCN-NEXT:    s_movk_i32 s4, 0x11e
5357; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5358; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
5359; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
5360; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
5361; GCN-NEXT:    s_mov_b32 s10, -1
5362; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5363; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
5364; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
5365; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5366; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
5367; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5368; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
5369; GCN-NEXT:    v_mul_hi_u32 v5, v0, s3
5370; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
5371; GCN-NEXT:    v_mul_lo_u32 v6, v2, s3
5372; GCN-NEXT:    s_mov_b32 s2, 0x976a7377
5373; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
5374; GCN-NEXT:    v_mul_lo_u32 v5, v0, s3
5375; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
5376; GCN-NEXT:    v_mul_lo_u32 v6, v0, v4
5377; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
5378; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
5379; GCN-NEXT:    v_mul_hi_u32 v11, v2, v4
5380; GCN-NEXT:    s_movk_i32 s3, 0x11f
5381; GCN-NEXT:    s_mov_b32 s9, s5
5382; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
5383; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
5384; GCN-NEXT:    v_mul_lo_u32 v10, v2, v5
5385; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
5386; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
5387; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
5388; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
5389; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
5390; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
5391; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
5392; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5393; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
5394; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5395; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5396; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
5397; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
5398; GCN-NEXT:    v_mul_hi_u32 v4, s6, v1
5399; GCN-NEXT:    v_mul_hi_u32 v5, s7, v1
5400; GCN-NEXT:    v_mul_lo_u32 v1, s7, v1
5401; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5402; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5403; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
5404; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
5405; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
5406; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
5407; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
5408; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5409; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
5410; GCN-NEXT:    v_mul_lo_u32 v2, v0, s3
5411; GCN-NEXT:    v_mul_hi_u32 v3, v0, s2
5412; GCN-NEXT:    v_mul_lo_u32 v4, v1, s2
5413; GCN-NEXT:    v_mov_b32_e32 v5, s3
5414; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5415; GCN-NEXT:    v_mul_lo_u32 v3, v0, s2
5416; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
5417; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s7, v2
5418; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
5419; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
5420; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
5421; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
5422; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s4, v4
5423; GCN-NEXT:    s_mov_b32 s2, 0x976a7376
5424; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
5425; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s2, v5
5426; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
5427; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
5428; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
5429; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
5430; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
5431; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
5432; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
5433; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
5434; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
5435; GCN-NEXT:    v_mov_b32_e32 v6, s7
5436; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
5437; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
5438; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
5439; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v3
5440; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
5441; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
5442; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
5443; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
5444; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
5445; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
5446; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5447; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
5448; GCN-NEXT:    s_endpgm
5449  %r = udiv i64 %x, 1235195949943
5450  store i64 %r, i64 addrspace(1)* %out
5451  ret void
5452}
5453
; Unsigned 64-bit division of %x by the constant 4096, stored to %out.
; The opt assertions show the udiv left unchanged in IR; the llc assertions
; show it selected as a single 64-bit logical shift right by 12.
5454define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
5455; CHECK-LABEL: @udiv_i64_pow2k_denom(
5456; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 4096
5457; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
5458; CHECK-NEXT:    ret void
5459;
5460; GCN-LABEL: udiv_i64_pow2k_denom:
5461; GCN:       ; %bb.0:
5462; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5463; GCN-NEXT:    s_mov_b32 s3, 0xf000
5464; GCN-NEXT:    s_mov_b32 s2, -1
5465; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5466; GCN-NEXT:    s_mov_b32 s0, s4
5467; GCN-NEXT:    s_mov_b32 s1, s5
5468; GCN-NEXT:    s_lshr_b64 s[4:5], s[6:7], 12
5469; GCN-NEXT:    v_mov_b32_e32 v0, s4
5470; GCN-NEXT:    v_mov_b32_e32 v1, s5
5471; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5472; GCN-NEXT:    s_endpgm
5473  %r = udiv i64 %x, 4096
5474  store i64 %r, i64 addrspace(1)* %out
5475  ret void
5476}
5477
; Unsigned 64-bit division of %x by (4096 shl %y), stored to %out.
; The opt assertions show both the shl and the udiv left unchanged in IR;
; the llc assertions show the division selected as a 64-bit logical shift
; right whose amount is the loaded %y value plus 12.
5478define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
5479; CHECK-LABEL: @udiv_i64_pow2_shl_denom(
5480; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
5481; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]]
5482; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
5483; CHECK-NEXT:    ret void
5484;
5485; GCN-LABEL: udiv_i64_pow2_shl_denom:
5486; GCN:       ; %bb.0:
5487; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5488; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
5489; GCN-NEXT:    s_mov_b32 s3, 0xf000
5490; GCN-NEXT:    s_mov_b32 s2, -1
5491; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5492; GCN-NEXT:    s_mov_b32 s0, s4
5493; GCN-NEXT:    s_add_i32 s8, s8, 12
5494; GCN-NEXT:    s_mov_b32 s1, s5
5495; GCN-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
5496; GCN-NEXT:    v_mov_b32_e32 v0, s4
5497; GCN-NEXT:    v_mov_b32_e32 v1, s5
5498; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5499; GCN-NEXT:    s_endpgm
5500  %shl.y = shl i64 4096, %y
5501  %r = udiv i64 %x, %shl.y
5502  store i64 %r, i64 addrspace(1)* %out
5503  ret void
5504}
5505
; Unsigned division of a <2 x i64> vector by the splat constant 4096.
; The opt assertions show the vector udiv split into two scalar i64 udivs
; (extractelement / insertelement); the llc assertions show each lane
; selected as a 64-bit logical shift right by 12.
5506define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5507; CHECK-LABEL: @udiv_v2i64_pow2k_denom(
5508; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5509; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
5510; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5511; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5512; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096
5513; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5514; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]]
5515; CHECK-NEXT:    ret void
5516;
5517; GCN-LABEL: udiv_v2i64_pow2k_denom:
5518; GCN:       ; %bb.0:
5519; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5520; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
5521; GCN-NEXT:    s_mov_b32 s7, 0xf000
5522; GCN-NEXT:    s_mov_b32 s6, -1
5523; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5524; GCN-NEXT:    s_lshr_b64 s[0:1], s[0:1], 12
5525; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
5526; GCN-NEXT:    v_mov_b32_e32 v0, s0
5527; GCN-NEXT:    v_mov_b32_e32 v1, s1
5528; GCN-NEXT:    v_mov_b32_e32 v2, s2
5529; GCN-NEXT:    v_mov_b32_e32 v3, s3
5530; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5531; GCN-NEXT:    s_endpgm
5532  %r = udiv <2 x i64> %x, <i64 4096, i64 4096>
5533  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5534  ret void
5535}
5536
; Unsigned division of a <2 x i64> vector by the mixed constant vector
; <4096, 4095>. The opt assertions show the vector udiv split into two scalar
; i64 udivs, one per lane. In the llc assertions lane 0 (divide by 4096)
; becomes a single 64-bit logical shift right by 12, while lane 1 (divide by
; 4095) gets the long multiply / add-with-carry expansion.
5537define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5538; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom(
5539; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5540; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
5541; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5542; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5543; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095
5544; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5545; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]]
5546; CHECK-NEXT:    ret void
5547;
5548; GCN-LABEL: udiv_v2i64_mixed_pow2k_denom:
5549; GCN:       ; %bb.0:
5550; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
5551; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
5552; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5553; GCN-NEXT:    s_movk_i32 s6, 0xf001
5554; GCN-NEXT:    v_mov_b32_e32 v7, 0
5555; GCN-NEXT:    v_mov_b32_e32 v2, 0
5556; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5557; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5558; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5559; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5560; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5561; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5562; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5563; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
5564; GCN-NEXT:    s_movk_i32 s0, 0xfff
5565; GCN-NEXT:    v_mul_hi_u32 v3, v0, s6
5566; GCN-NEXT:    v_mul_lo_u32 v5, v1, s6
5567; GCN-NEXT:    v_mul_lo_u32 v4, v0, s6
5568; GCN-NEXT:    s_mov_b32 s7, 0xf000
5569; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, v0, v3
5570; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
5571; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
5572; GCN-NEXT:    v_mul_lo_u32 v5, v0, v3
5573; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
5574; GCN-NEXT:    v_mul_hi_u32 v9, v1, v3
5575; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
5576; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5577; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v7, v8, vcc
5578; GCN-NEXT:    v_mul_lo_u32 v8, v1, v4
5579; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
5580; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
5581; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
5582; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v2, vcc
5583; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
5584; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v3
5585; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
5586; GCN-NEXT:    v_mul_hi_u32 v5, v0, s6
5587; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v1, v4, s[2:3]
5588; GCN-NEXT:    v_mul_lo_u32 v6, v3, s6
5589; GCN-NEXT:    v_mul_lo_u32 v8, v0, s6
5590; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
5591; GCN-NEXT:    s_mov_b32 s6, -1
5592; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
5593; GCN-NEXT:    v_mul_lo_u32 v6, v0, v5
5594; GCN-NEXT:    v_mul_hi_u32 v9, v0, v8
5595; GCN-NEXT:    v_mul_hi_u32 v10, v0, v5
5596; GCN-NEXT:    v_mul_hi_u32 v11, v3, v5
5597; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
5598; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v7, v10, vcc
5599; GCN-NEXT:    v_mul_lo_u32 v10, v3, v8
5600; GCN-NEXT:    v_mul_hi_u32 v8, v3, v8
5601; GCN-NEXT:    v_mul_lo_u32 v3, v3, v5
5602; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
5603; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v8, vcc
5604; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v2, vcc
5605; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
5606; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
5607; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
5608; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
5609; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
5610; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5611; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5612; GCN-NEXT:    v_mul_lo_u32 v3, s10, v1
5613; GCN-NEXT:    v_mul_hi_u32 v4, s10, v0
5614; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
5615; GCN-NEXT:    v_mul_hi_u32 v6, s11, v1
5616; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
5617; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
5618; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
5619; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
5620; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
5621; GCN-NEXT:    s_lshr_b64 s[2:3], s[8:9], 12
5622; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
5623; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
5624; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v6, v2, vcc
5625; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5626; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v7, v2, vcc
5627; GCN-NEXT:    v_mul_lo_u32 v2, v1, s0
5628; GCN-NEXT:    v_mul_hi_u32 v3, v0, s0
5629; GCN-NEXT:    v_mul_lo_u32 v4, v0, s0
5630; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5631; GCN-NEXT:    v_mov_b32_e32 v3, s11
5632; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s10, v4
5633; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
5634; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s0, v4
5635; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
5636; GCN-NEXT:    s_movk_i32 s0, 0xffe
5637; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
5638; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
5639; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
5640; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
5641; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
5642; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
5643; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
5644; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
5645; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
5646; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
5647; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
5648; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
5649; GCN-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
5650; GCN-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
5651; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
5652; GCN-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
5653; GCN-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
5654; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
5655; GCN-NEXT:    v_mov_b32_e32 v0, s2
5656; GCN-NEXT:    v_mov_b32_e32 v1, s3
5657; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5658; GCN-NEXT:    s_endpgm
5659  %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
5660  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5661  ret void
5662}
5663
; Unsigned division of a <2 x i64> vector where each lane's denominator is
; 4096 shifted left by the matching lane of %y. The opt assertions show the
; vector udiv split into two scalar i64 udivs; the llc assertions show each
; lane selected as a 64-bit logical shift right by (loaded shift amount + 12).
5664define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
5665; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom(
5666; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
5667; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5668; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
5669; CHECK-NEXT:    [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]]
5670; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
5671; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
5672; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
5673; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]]
5674; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
5675; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]]
5676; CHECK-NEXT:    ret void
5677;
5678; GCN-LABEL: udiv_v2i64_pow2_shl_denom:
5679; GCN:       ; %bb.0:
5680; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5681; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
5682; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
5683; GCN-NEXT:    s_mov_b32 s7, 0xf000
5684; GCN-NEXT:    s_mov_b32 s6, -1
5685; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5686; GCN-NEXT:    s_add_i32 s0, s0, 12
5687; GCN-NEXT:    s_add_i32 s2, s2, 12
5688; GCN-NEXT:    s_lshr_b64 s[0:1], s[8:9], s0
5689; GCN-NEXT:    s_lshr_b64 s[2:3], s[10:11], s2
5690; GCN-NEXT:    v_mov_b32_e32 v0, s0
5691; GCN-NEXT:    v_mov_b32_e32 v1, s1
5692; GCN-NEXT:    v_mov_b32_e32 v2, s2
5693; GCN-NEXT:    v_mov_b32_e32 v3, s3
5694; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5695; GCN-NEXT:    s_endpgm
5696  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
5697  %r = udiv <2 x i64> %x, %shl.y
5698  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5699  ret void
5700}
5701
; Unsigned 64-bit remainder of %x by the odd constant 1235195393993, stored
; to %out. The opt assertions show the i64 urem left in place as a single
; instruction; the llc assertions pin the long multi-instruction expansion
; (float reciprocal seed, 32-bit multiply / add-with-carry steps, then
; subtract-and-select fix-ups) emitted for it.
5702define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
5703; CHECK-LABEL: @urem_i64_oddk_denom(
5704; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993
5705; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
5706; CHECK-NEXT:    ret void
5707;
5708; GCN-LABEL: urem_i64_oddk_denom:
5709; GCN:       ; %bb.0:
5710; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
5711; GCN-NEXT:    v_mov_b32_e32 v1, 0x4f800000
5712; GCN-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
5713; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5714; GCN-NEXT:    s_movk_i32 s2, 0xfee0
5715; GCN-NEXT:    s_mov_b32 s3, 0x689e0837
5716; GCN-NEXT:    v_mov_b32_e32 v8, 0
5717; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5718; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5719; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5720; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5721; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5722; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5723; GCN-NEXT:    v_mov_b32_e32 v7, 0
5724; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5725; GCN-NEXT:    v_mul_lo_u32 v2, v0, s2
5726; GCN-NEXT:    v_mul_hi_u32 v3, v0, s3
5727; GCN-NEXT:    v_mul_lo_u32 v4, v1, s3
5728; GCN-NEXT:    s_movk_i32 s12, 0x11f
5729; GCN-NEXT:    s_mov_b32 s13, 0x9761f7c9
5730; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5731; GCN-NEXT:    v_mul_lo_u32 v3, v0, s3
5732; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
5733; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
5734; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
5735; GCN-NEXT:    v_mul_hi_u32 v6, v0, v3
5736; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
5737; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
5738; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5739; GCN-NEXT:    s_mov_b32 s9, s5
5740; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5741; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
5742; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
5743; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
5744; GCN-NEXT:    s_movk_i32 s5, 0x11e
5745; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5746; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
5747; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
5748; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5749; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
5750; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5751; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
5752; GCN-NEXT:    v_mul_hi_u32 v5, v0, s3
5753; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
5754; GCN-NEXT:    v_mul_lo_u32 v6, v2, s3
5755; GCN-NEXT:    s_mov_b32 s8, s4
5756; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
5757; GCN-NEXT:    v_mul_lo_u32 v5, v0, s3
5758; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
5759; GCN-NEXT:    v_mul_lo_u32 v6, v0, v4
5760; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
5761; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
5762; GCN-NEXT:    v_mul_hi_u32 v11, v2, v4
5763; GCN-NEXT:    s_mov_b32 s4, 0x9761f7c8
5764; GCN-NEXT:    s_mov_b32 s11, 0xf000
5765; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
5766; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
5767; GCN-NEXT:    v_mul_lo_u32 v10, v2, v5
5768; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
5769; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
5770; GCN-NEXT:    s_mov_b32 s10, -1
5771; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
5772; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
5773; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
5774; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
5775; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
5776; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5777; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
5778; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5779; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5780; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
5781; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
5782; GCN-NEXT:    v_mul_hi_u32 v4, s6, v1
5783; GCN-NEXT:    v_mul_hi_u32 v5, s7, v1
5784; GCN-NEXT:    v_mul_lo_u32 v1, s7, v1
5785; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5786; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5787; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
5788; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
5789; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
5790; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
5791; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
5792; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5793; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
5794; GCN-NEXT:    v_mul_lo_u32 v2, v0, s12
5795; GCN-NEXT:    v_mul_hi_u32 v3, v0, s13
5796; GCN-NEXT:    v_mul_lo_u32 v1, v1, s13
5797; GCN-NEXT:    v_mul_lo_u32 v0, v0, s13
5798; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5799; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
5800; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
5801; GCN-NEXT:    v_mov_b32_e32 v3, s12
5802; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
5803; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
5804; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s13, v0
5805; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
5806; GCN-NEXT:    v_cmp_lt_u32_e64 s[2:3], s5, v5
5807; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
5808; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
5809; GCN-NEXT:    v_cmp_lt_u32_e64 s[2:3], s4, v4
5810; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s13, v4
5811; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
5812; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, v5
5813; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
5814; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
5815; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
5816; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
5817; GCN-NEXT:    v_mov_b32_e32 v5, s7
5818; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
5819; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v1
5820; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
5821; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
5822; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
5823; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s12, v1
5824; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
5825; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
5826; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5827; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
5828; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5829; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
5830; GCN-NEXT:    s_endpgm
5831  %r = urem i64 %x, 1235195393993
5832  store i64 %r, i64 addrspace(1)* %out
5833  ret void
5834}
5835
; Unsigned 64-bit remainder by the constant power of two 4096.
; The opt assertions expect the urem to be left untouched in the IR. The llc
; assertions expect it to collapse to a single s_and_b32 of the low dword with
; 0xfff, with the high dword of the stored result set to 0.
5836define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
5837; CHECK-LABEL: @urem_i64_pow2k_denom(
5838; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 4096
5839; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
5840; CHECK-NEXT:    ret void
5841;
5842; GCN-LABEL: urem_i64_pow2k_denom:
5843; GCN:       ; %bb.0:
5844; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5845; GCN-NEXT:    s_mov_b32 s3, 0xf000
5846; GCN-NEXT:    s_mov_b32 s2, -1
5847; GCN-NEXT:    v_mov_b32_e32 v1, 0
5848; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5849; GCN-NEXT:    s_mov_b32 s0, s4
5850; GCN-NEXT:    s_and_b32 s4, s6, 0xfff
5851; GCN-NEXT:    s_mov_b32 s1, s5
5852; GCN-NEXT:    v_mov_b32_e32 v0, s4
5853; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5854; GCN-NEXT:    s_endpgm
5855  %r = urem i64 %x, 4096
5856  store i64 %r, i64 addrspace(1)* %out
5857  ret void
5858}
5859
; Unsigned 64-bit remainder where the denominator is (4096 << %y), i.e. a
; power of two that is only known at run time.
; The opt assertions expect the shl/urem pair to be left untouched. The llc
; assertions expect a mask sequence: shift 0x1000 left by y (s_lshl_b64), add
; -1 across both dwords (s_add_u32 / s_addc_u32), then s_and_b64 with x.
5860define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
5861; CHECK-LABEL: @urem_i64_pow2_shl_denom(
5862; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
5863; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]]
5864; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
5865; CHECK-NEXT:    ret void
5866;
5867; GCN-LABEL: urem_i64_pow2_shl_denom:
5868; GCN:       ; %bb.0:
5869; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5870; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
5871; GCN-NEXT:    s_mov_b32 s3, 0xf000
5872; GCN-NEXT:    s_mov_b32 s2, -1
5873; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5874; GCN-NEXT:    s_mov_b32 s0, s4
5875; GCN-NEXT:    s_mov_b32 s1, s5
5876; GCN-NEXT:    s_mov_b32 s5, 0
5877; GCN-NEXT:    s_movk_i32 s4, 0x1000
5878; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
5879; GCN-NEXT:    s_add_u32 s4, s4, -1
5880; GCN-NEXT:    s_addc_u32 s5, s5, -1
5881; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
5882; GCN-NEXT:    v_mov_b32_e32 v0, s4
5883; GCN-NEXT:    v_mov_b32_e32 v1, s5
5884; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5885; GCN-NEXT:    s_endpgm
5886  %shl.y = shl i64 4096, %y
5887  %r = urem i64 %x, %shl.y
5888  store i64 %r, i64 addrspace(1)* %out
5889  ret void
5890}
5891
; <2 x i64> unsigned remainder by the splat constant 4096.
; The opt assertions expect the vector urem to be split into one scalar
; urem i64 per element (extractelement / urem / insertelement). The llc
; assertions expect each element's low dword to be masked with 0xfff via
; s_and_b32, and both high dwords of the stored vector to be 0 (v1 = 0,
; v3 copied from v1).
5892define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5893; CHECK-LABEL: @urem_v2i64_pow2k_denom(
5894; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5895; CHECK-NEXT:    [[TMP2:%.*]] = urem i64 [[TMP1]], 4096
5896; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5897; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5898; CHECK-NEXT:    [[TMP5:%.*]] = urem i64 [[TMP4]], 4096
5899; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5900; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]]
5901; CHECK-NEXT:    ret void
5902;
5903; GCN-LABEL: urem_v2i64_pow2k_denom:
5904; GCN:       ; %bb.0:
5905; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5906; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
5907; GCN-NEXT:    s_movk_i32 s8, 0xfff
5908; GCN-NEXT:    v_mov_b32_e32 v1, 0
5909; GCN-NEXT:    s_mov_b32 s7, 0xf000
5910; GCN-NEXT:    s_mov_b32 s6, -1
5911; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5912; GCN-NEXT:    s_and_b32 s0, s0, s8
5913; GCN-NEXT:    s_and_b32 s1, s2, s8
5914; GCN-NEXT:    v_mov_b32_e32 v0, s0
5915; GCN-NEXT:    v_mov_b32_e32 v2, s1
5916; GCN-NEXT:    v_mov_b32_e32 v3, v1
5917; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5918; GCN-NEXT:    s_endpgm
5919  %r = urem <2 x i64> %x, <i64 4096, i64 4096>
5920  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5921  ret void
5922}
5923
; <2 x i64> unsigned remainder where each denominator lane is (4096 << %y).
; The opt assertions expect the vector shl to stay a vector op while the urem
; is split into one scalar urem i64 per element. The llc assertions expect,
; for each element, the same mask sequence as the scalar variant: s_lshl_b64
; of 0x1000, a 64-bit add of -1, then s_and_b64 with the matching element of x.
5924define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
5925; CHECK-LABEL: @urem_v2i64_pow2_shl_denom(
5926; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
5927; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5928; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
5929; CHECK-NEXT:    [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]]
5930; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
5931; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
5932; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
5933; CHECK-NEXT:    [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]]
5934; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
5935; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]]
5936; CHECK-NEXT:    ret void
5937;
5938; GCN-LABEL: urem_v2i64_pow2_shl_denom:
5939; GCN:       ; %bb.0:
5940; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5941; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
5942; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
5943; GCN-NEXT:    s_mov_b32 s13, 0
5944; GCN-NEXT:    s_movk_i32 s12, 0x1000
5945; GCN-NEXT:    s_mov_b32 s7, 0xf000
5946; GCN-NEXT:    s_mov_b32 s6, -1
5947; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5948; GCN-NEXT:    s_lshl_b64 s[2:3], s[12:13], s2
5949; GCN-NEXT:    s_lshl_b64 s[0:1], s[12:13], s0
5950; GCN-NEXT:    s_add_u32 s0, s0, -1
5951; GCN-NEXT:    s_addc_u32 s1, s1, -1
5952; GCN-NEXT:    s_and_b64 s[0:1], s[8:9], s[0:1]
5953; GCN-NEXT:    s_add_u32 s2, s2, -1
5954; GCN-NEXT:    s_addc_u32 s3, s3, -1
5955; GCN-NEXT:    s_and_b64 s[2:3], s[10:11], s[2:3]
5956; GCN-NEXT:    v_mov_b32_e32 v0, s0
5957; GCN-NEXT:    v_mov_b32_e32 v1, s1
5958; GCN-NEXT:    v_mov_b32_e32 v2, s2
5959; GCN-NEXT:    v_mov_b32_e32 v3, s3
5960; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5961; GCN-NEXT:    s_endpgm
5962  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
5963  %r = urem <2 x i64> %x, %shl.y
5964  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5965  ret void
5966}
5967
; Signed 64-bit division by the odd constant 1235195 (0x12d8fb).
; The opt assertions expect the sdiv to be left untouched in the IR. The llc
; assertions expect a long inline expansion: a v_rcp_f32 based estimate, then
; mul_lo/mul_hi/add-with-carry sequences using 0xffed2705 (the 32-bit negation
; of 0x12d8fb), then a compare/select correction against 0x12d8fa. The sign of
; x is taken with s_ashr_i32 ..., 31 and applied to the result at the end by
; an xor followed by a 64-bit subtract.
5968define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
5969; CHECK-LABEL: @sdiv_i64_oddk_denom(
5970; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195
5971; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
5972; CHECK-NEXT:    ret void
5973;
5974; GCN-LABEL: sdiv_i64_oddk_denom:
5975; GCN:       ; %bb.0:
5976; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
5977; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
5978; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5979; GCN-NEXT:    s_mov_b32 s2, 0xffed2705
5980; GCN-NEXT:    v_mov_b32_e32 v8, 0
5981; GCN-NEXT:    v_mov_b32_e32 v7, 0
5982; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5983; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5984; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5985; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5986; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5987; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5988; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
5989; GCN-NEXT:    s_mov_b32 s7, 0xf000
5990; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
5991; GCN-NEXT:    v_mul_lo_u32 v2, v1, s2
5992; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
5993; GCN-NEXT:    s_mov_b32 s6, -1
5994; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5995; GCN-NEXT:    s_mov_b32 s4, s8
5996; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5997; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
5998; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
5999; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
6000; GCN-NEXT:    v_mul_hi_u32 v3, v0, v2
6001; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
6002; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6003; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
6004; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
6005; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
6006; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
6007; GCN-NEXT:    s_mov_b32 s5, s9
6008; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
6009; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
6010; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
6011; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6012; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
6013; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
6014; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
6015; GCN-NEXT:    v_mul_lo_u32 v4, v2, s2
6016; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
6017; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
6018; GCN-NEXT:    v_mul_lo_u32 v5, v0, s2
6019; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
6020; GCN-NEXT:    v_mul_lo_u32 v10, v0, v4
6021; GCN-NEXT:    v_mul_hi_u32 v12, v0, v4
6022; GCN-NEXT:    v_mul_hi_u32 v11, v0, v5
6023; GCN-NEXT:    v_mul_hi_u32 v9, v2, v5
6024; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
6025; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
6026; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6027; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
6028; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
6029; GCN-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
6030; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
6031; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
6032; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6033; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
6034; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6035; GCN-NEXT:    s_ashr_i32 s2, s11, 31
6036; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
6037; GCN-NEXT:    s_add_u32 s0, s10, s2
6038; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6039; GCN-NEXT:    s_mov_b32 s3, s2
6040; GCN-NEXT:    s_addc_u32 s1, s11, s2
6041; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
6042; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6043; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
6044; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
6045; GCN-NEXT:    v_mul_hi_u32 v4, s0, v1
6046; GCN-NEXT:    v_mul_hi_u32 v5, s1, v1
6047; GCN-NEXT:    v_mul_lo_u32 v1, s1, v1
6048; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6049; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
6050; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
6051; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
6052; GCN-NEXT:    s_mov_b32 s3, 0x12d8fb
6053; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
6054; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6055; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
6056; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6057; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
6058; GCN-NEXT:    v_mul_lo_u32 v2, v1, s3
6059; GCN-NEXT:    v_mul_hi_u32 v3, s3, v0
6060; GCN-NEXT:    v_mul_lo_u32 v4, v0, s3
6061; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6062; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s0, v4
6063; GCN-NEXT:    v_mov_b32_e32 v3, s1
6064; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
6065; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v4
6066; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
6067; GCN-NEXT:    s_mov_b32 s0, 0x12d8fa
6068; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
6069; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
6070; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
6071; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
6072; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
6073; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
6074; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
6075; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
6076; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
6077; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
6078; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
6079; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
6080; GCN-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
6081; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
6082; GCN-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
6083; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
6084; GCN-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
6085; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
6086; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
6087; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
6088; GCN-NEXT:    v_mov_b32_e32 v2, s2
6089; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
6090; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
6091; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6092; GCN-NEXT:    s_endpgm
6093  %r = sdiv i64 %x, 1235195
6094  store i64 %r, i64 addrspace(1)* %out
6095  ret void
6096}
6097
; Signed 64-bit division by the constant power of two 4096.
; The opt assertions expect the sdiv to be left untouched in the IR. The llc
; assertions expect a bias-and-shift sequence instead of a full division:
; the sign word of x (s_ashr_i32 ..., 31) is shifted right by 20 so it is
; 0xfff for negative x and 0 otherwise, added to x with carry, and the sum is
; arithmetically shifted right by 12 (s_ashr_i64).
6098define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
6099; CHECK-LABEL: @sdiv_i64_pow2k_denom(
6100; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 4096
6101; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
6102; CHECK-NEXT:    ret void
6103;
6104; GCN-LABEL: sdiv_i64_pow2k_denom:
6105; GCN:       ; %bb.0:
6106; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
6107; GCN-NEXT:    s_mov_b32 s3, 0xf000
6108; GCN-NEXT:    s_mov_b32 s2, -1
6109; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6110; GCN-NEXT:    s_mov_b32 s0, s4
6111; GCN-NEXT:    s_ashr_i32 s4, s7, 31
6112; GCN-NEXT:    s_lshr_b32 s4, s4, 20
6113; GCN-NEXT:    s_add_u32 s4, s6, s4
6114; GCN-NEXT:    s_mov_b32 s1, s5
6115; GCN-NEXT:    s_addc_u32 s5, s7, 0
6116; GCN-NEXT:    s_ashr_i64 s[4:5], s[4:5], 12
6117; GCN-NEXT:    v_mov_b32_e32 v0, s4
6118; GCN-NEXT:    v_mov_b32_e32 v1, s5
6119; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6120; GCN-NEXT:    s_endpgm
6121  %r = sdiv i64 %x, 4096
6122  store i64 %r, i64 addrspace(1)* %out
6123  ret void
6124}
6125
; Signed 64-bit division where the denominator is (4096 << %y).
; The opt assertions expect the shl/sdiv pair to be left untouched. Unlike the
; constant power-of-two case, the llc assertions here expect no shift
; shortcut. They expect the full inline expansion: the denominator and
; numerator are each made non-negative via add/xor with their sign word
; (s[12:13] and s[14:15]), a v_rcp_f32 based estimate is refined with
; mul_lo/mul_hi/add-with-carry sequences, the quotient is corrected by
; compare/select, and the result sign (s[14:15] xor s[12:13]) is applied at
; the end with xor plus a 64-bit subtract.
6126define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
6127; CHECK-LABEL: @sdiv_i64_pow2_shl_denom(
6128; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
6129; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]]
6130; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
6131; CHECK-NEXT:    ret void
6132;
6133; GCN-LABEL: sdiv_i64_pow2_shl_denom:
6134; GCN:       ; %bb.0:
6135; GCN-NEXT:    s_load_dword s4, s[0:1], 0xd
6136; GCN-NEXT:    s_mov_b32 s3, 0
6137; GCN-NEXT:    s_movk_i32 s2, 0x1000
6138; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
6139; GCN-NEXT:    s_mov_b32 s7, 0xf000
6140; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6141; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
6142; GCN-NEXT:    s_ashr_i32 s12, s3, 31
6143; GCN-NEXT:    s_add_u32 s2, s2, s12
6144; GCN-NEXT:    s_mov_b32 s13, s12
6145; GCN-NEXT:    s_addc_u32 s3, s3, s12
6146; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
6147; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
6148; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
6149; GCN-NEXT:    s_sub_u32 s4, 0, s2
6150; GCN-NEXT:    s_subb_u32 s5, 0, s3
6151; GCN-NEXT:    s_ashr_i32 s14, s11, 31
6152; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
6153; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6154; GCN-NEXT:    s_mov_b32 s15, s14
6155; GCN-NEXT:    s_mov_b32 s6, -1
6156; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
6157; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
6158; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6159; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
6160; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6161; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6162; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
6163; GCN-NEXT:    v_mul_lo_u32 v2, s4, v1
6164; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
6165; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
6166; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6167; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
6168; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
6169; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
6170; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
6171; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
6172; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6173; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
6174; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
6175; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
6176; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
6177; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
6178; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
6179; GCN-NEXT:    v_mov_b32_e32 v4, 0
6180; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
6181; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6182; GCN-NEXT:    v_mov_b32_e32 v6, 0
6183; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
6184; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
6185; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
6186; GCN-NEXT:    v_mul_lo_u32 v5, s4, v2
6187; GCN-NEXT:    v_mul_hi_u32 v7, s4, v0
6188; GCN-NEXT:    v_mul_lo_u32 v8, s5, v0
6189; GCN-NEXT:    s_mov_b32 s5, s9
6190; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6191; GCN-NEXT:    v_mul_lo_u32 v7, s4, v0
6192; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
6193; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
6194; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
6195; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
6196; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
6197; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
6198; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
6199; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6200; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
6201; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
6202; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
6203; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
6204; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
6205; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6206; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
6207; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6208; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
6209; GCN-NEXT:    s_add_u32 s0, s10, s14
6210; GCN-NEXT:    s_addc_u32 s1, s11, s14
6211; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6212; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
6213; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6214; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
6215; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
6216; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
6217; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
6218; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
6219; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6220; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6221; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
6222; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
6223; GCN-NEXT:    s_mov_b32 s4, s8
6224; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6225; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6226; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
6227; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6228; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
6229; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
6230; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
6231; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
6232; GCN-NEXT:    v_mov_b32_e32 v5, s3
6233; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6234; GCN-NEXT:    v_mul_lo_u32 v3, s2, v0
6235; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
6236; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
6237; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
6238; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
6239; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
6240; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
6241; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
6242; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
6243; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
6244; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
6245; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
6246; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
6247; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
6248; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
6249; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
6250; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
6251; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
6252; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
6253; GCN-NEXT:    v_mov_b32_e32 v6, s11
6254; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
6255; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
6256; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
6257; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
6258; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
6259; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
6260; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
6261; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
6262; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
6263; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6264; GCN-NEXT:    s_xor_b64 s[0:1], s[14:15], s[12:13]
6265; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
6266; GCN-NEXT:    v_xor_b32_e32 v0, s0, v0
6267; GCN-NEXT:    v_xor_b32_e32 v1, s1, v1
6268; GCN-NEXT:    v_mov_b32_e32 v2, s1
6269; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
6270; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
6271; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6272; GCN-NEXT:    s_endpgm
6273  %shl.y = shl i64 4096, %y
6274  %r = sdiv i64 %x, %shl.y
6275  store i64 %r, i64 addrspace(1)* %out
6276  ret void
6277}
6278
; <2 x i64> signed division by the splat constant 4096.
; The opt assertions expect the vector sdiv to be split into one scalar
; sdiv i64 per element. The llc assertions expect each element to use the
; bias-and-shift sequence: sign word shifted right by 20, added to the element
; with carry, then s_ashr_i64 by 12.
6279define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
6280; CHECK-LABEL: @sdiv_v2i64_pow2k_denom(
6281; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
6282; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
6283; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
6284; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
6285; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096
6286; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
6287; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]]
6288; CHECK-NEXT:    ret void
6289;
6290; GCN-LABEL: sdiv_v2i64_pow2k_denom:
6291; GCN:       ; %bb.0:
6292; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6293; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
6294; GCN-NEXT:    s_mov_b32 s7, 0xf000
6295; GCN-NEXT:    s_mov_b32 s6, -1
6296; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6297; GCN-NEXT:    s_ashr_i32 s8, s1, 31
6298; GCN-NEXT:    s_lshr_b32 s8, s8, 20
6299; GCN-NEXT:    s_add_u32 s0, s0, s8
6300; GCN-NEXT:    s_addc_u32 s1, s1, 0
6301; GCN-NEXT:    s_ashr_i32 s8, s3, 31
6302; GCN-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
6303; GCN-NEXT:    s_lshr_b32 s8, s8, 20
6304; GCN-NEXT:    s_add_u32 s2, s2, s8
6305; GCN-NEXT:    s_addc_u32 s3, s3, 0
6306; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
6307; GCN-NEXT:    v_mov_b32_e32 v0, s0
6308; GCN-NEXT:    v_mov_b32_e32 v1, s1
6309; GCN-NEXT:    v_mov_b32_e32 v2, s2
6310; GCN-NEXT:    v_mov_b32_e32 v3, s3
6311; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
6312; GCN-NEXT:    s_endpgm
6313  %r = sdiv <2 x i64> %x, <i64 4096, i64 4096>
6314  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
6315  ret void
6316}
6317
; <2 x i64> signed division by the mixed constant vector <4096, 4095>: lane 0
; is a power of two, lane 1 is not.
; The opt assertions expect the vector sdiv to be split into one scalar
; sdiv i64 per element. The llc assertions expect the two lanes to be lowered
; differently: lane 0 uses the bias-and-shift sequence ending in s_ashr_i64 by
; 12 (result in s[2:3]), while lane 1 uses the long v_rcp_f32 based expansion
; with 0xfff as the divisor and a final xor/subtract with the sign word s8.
; NOTE(review): the doubled "s" in the function name looks like a typo for
; "sdiv"; renaming it would also require updating both label assertions --
; TODO confirm before changing.
6318define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
6319; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom(
6320; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
6321; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
6322; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
6323; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
6324; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095
6325; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
6326; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]]
6327; CHECK-NEXT:    ret void
6328;
6329; GCN-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
6330; GCN:       ; %bb.0:
6331; GCN-NEXT:    v_mov_b32_e32 v0, 0x457ff000
6332; GCN-NEXT:    v_mov_b32_e32 v1, 0x4f800000
6333; GCN-NEXT:    v_mac_f32_e32 v0, 0, v1
6334; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6335; GCN-NEXT:    s_movk_i32 s6, 0xf001
6336; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6337; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
6338; GCN-NEXT:    s_mov_b32 s7, 0xf000
6339; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
6340; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
6341; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6342; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
6343; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6344; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6345; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6346; GCN-NEXT:    s_ashr_i32 s0, s9, 31
6347; GCN-NEXT:    s_lshr_b32 s0, s0, 20
6348; GCN-NEXT:    v_mul_hi_u32 v2, s6, v0
6349; GCN-NEXT:    v_mul_lo_u32 v3, v1, s6
6350; GCN-NEXT:    s_add_u32 s2, s8, s0
6351; GCN-NEXT:    s_addc_u32 s3, s9, 0
6352; GCN-NEXT:    s_ashr_i32 s8, s11, 31
6353; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
6354; GCN-NEXT:    v_mul_lo_u32 v3, v0, s6
6355; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
6356; GCN-NEXT:    v_mul_lo_u32 v4, v0, v2
6357; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
6358; GCN-NEXT:    v_mul_hi_u32 v5, v0, v3
6359; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
6360; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6361; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
6362; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
6363; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
6364; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
6365; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
6366; GCN-NEXT:    s_mov_b32 s9, s8
6367; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
6368; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
6369; GCN-NEXT:    v_mov_b32_e32 v4, 0
6370; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
6371; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6372; GCN-NEXT:    v_mov_b32_e32 v6, 0
6373; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
6374; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
6375; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
6376; GCN-NEXT:    v_mul_lo_u32 v5, v2, s6
6377; GCN-NEXT:    v_mul_hi_u32 v7, s6, v0
6378; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6379; GCN-NEXT:    v_mul_lo_u32 v7, v0, s6
6380; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
6381; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
6382; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
6383; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
6384; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
6385; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
6386; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
6387; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6388; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
6389; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
6390; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
6391; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
6392; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
6393; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6394; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
6395; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6396; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
6397; GCN-NEXT:    s_add_u32 s0, s10, s8
6398; GCN-NEXT:    s_addc_u32 s1, s11, s8
6399; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6400; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[8:9]
6401; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6402; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
6403; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
6404; GCN-NEXT:    v_mul_hi_u32 v5, s0, v1
6405; GCN-NEXT:    v_mul_hi_u32 v7, s1, v1
6406; GCN-NEXT:    v_mul_lo_u32 v1, s1, v1
6407; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6408; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6409; GCN-NEXT:    v_mul_lo_u32 v5, s1, v0
6410; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
6411; GCN-NEXT:    s_movk_i32 s9, 0xfff
6412; GCN-NEXT:    s_mov_b32 s6, -1
6413; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6414; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6415; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
6416; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6417; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
6418; GCN-NEXT:    v_mul_lo_u32 v2, v1, s9
6419; GCN-NEXT:    v_mul_hi_u32 v3, s9, v0
6420; GCN-NEXT:    v_mul_lo_u32 v4, v0, s9
6421; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6422; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s0, v4
6423; GCN-NEXT:    v_mov_b32_e32 v3, s1
6424; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
6425; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v4
6426; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
6427; GCN-NEXT:    s_movk_i32 s0, 0xffe
6428; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
6429; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
6430; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
6431; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
6432; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
6433; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
6434; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
6435; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
6436; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
6437; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
6438; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
6439; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
6440; GCN-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
6441; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
6442; GCN-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
6443; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
6444; GCN-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
6445; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
6446; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
6447; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
6448; GCN-NEXT:    v_xor_b32_e32 v1, s8, v1
6449; GCN-NEXT:    v_mov_b32_e32 v3, s8
6450; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
6451; GCN-NEXT:    v_mov_b32_e32 v0, s2
6452; GCN-NEXT:    v_mov_b32_e32 v1, s3
6453; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
6454; GCN-NEXT:    s_endpgm
6455  %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
6456  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
6457  ret void
6458}
6459
6460define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
6461; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom(
6462; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
6463; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
6464; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
6465; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]]
6466; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
6467; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
6468; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
6469; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]]
6470; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
6471; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]]
6472; CHECK-NEXT:    ret void
6473;
6474; GCN-LABEL: sdiv_v2i64_pow2_shl_denom:
6475; GCN:       ; %bb.0:
6476; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
6477; GCN-NEXT:    s_mov_b32 s3, 0
6478; GCN-NEXT:    s_movk_i32 s2, 0x1000
6479; GCN-NEXT:    s_mov_b32 s18, 0x4f800000
6480; GCN-NEXT:    s_mov_b32 s19, 0x5f7ffffc
6481; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6482; GCN-NEXT:    s_lshl_b64 s[12:13], s[2:3], s6
6483; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
6484; GCN-NEXT:    s_ashr_i32 s16, s3, 31
6485; GCN-NEXT:    s_add_u32 s2, s2, s16
6486; GCN-NEXT:    s_mov_b32 s17, s16
6487; GCN-NEXT:    s_addc_u32 s3, s3, s16
6488; GCN-NEXT:    s_xor_b64 s[14:15], s[2:3], s[16:17]
6489; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s14
6490; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s15
6491; GCN-NEXT:    s_mov_b32 s20, 0x2f800000
6492; GCN-NEXT:    s_mov_b32 s21, 0xcf800000
6493; GCN-NEXT:    s_sub_u32 s6, 0, s14
6494; GCN-NEXT:    v_mac_f32_e32 v0, s18, v1
6495; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6496; GCN-NEXT:    s_subb_u32 s7, 0, s15
6497; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6498; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
6499; GCN-NEXT:    v_mul_f32_e32 v0, s19, v0
6500; GCN-NEXT:    v_mul_f32_e32 v1, s20, v0
6501; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6502; GCN-NEXT:    v_mac_f32_e32 v0, s21, v1
6503; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6504; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6505; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
6506; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
6507; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
6508; GCN-NEXT:    v_mul_lo_u32 v5, s6, v0
6509; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6510; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
6511; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
6512; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
6513; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
6514; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
6515; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6516; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
6517; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
6518; GCN-NEXT:    v_mul_lo_u32 v6, v1, v5
6519; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
6520; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
6521; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
6522; GCN-NEXT:    v_mov_b32_e32 v4, 0
6523; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
6524; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6525; GCN-NEXT:    v_mov_b32_e32 v6, 0
6526; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
6527; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
6528; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
6529; GCN-NEXT:    v_mul_lo_u32 v5, s6, v2
6530; GCN-NEXT:    v_mul_hi_u32 v7, s6, v0
6531; GCN-NEXT:    v_mul_lo_u32 v8, s7, v0
6532; GCN-NEXT:    s_mov_b32 s7, 0xf000
6533; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6534; GCN-NEXT:    v_mul_lo_u32 v7, s6, v0
6535; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
6536; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
6537; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
6538; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
6539; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
6540; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
6541; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
6542; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6543; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
6544; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
6545; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
6546; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
6547; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
6548; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6549; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
6550; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6551; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
6552; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6553; GCN-NEXT:    s_ashr_i32 s2, s9, 31
6554; GCN-NEXT:    s_add_u32 s0, s8, s2
6555; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6556; GCN-NEXT:    s_mov_b32 s3, s2
6557; GCN-NEXT:    s_addc_u32 s1, s9, s2
6558; GCN-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
6559; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6560; GCN-NEXT:    v_mul_lo_u32 v2, s8, v1
6561; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
6562; GCN-NEXT:    v_mul_hi_u32 v5, s8, v1
6563; GCN-NEXT:    v_mul_hi_u32 v7, s9, v1
6564; GCN-NEXT:    v_mul_lo_u32 v1, s9, v1
6565; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6566; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6567; GCN-NEXT:    v_mul_lo_u32 v5, s9, v0
6568; GCN-NEXT:    v_mul_hi_u32 v0, s9, v0
6569; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
6570; GCN-NEXT:    s_mov_b32 s6, -1
6571; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6572; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6573; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
6574; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6575; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
6576; GCN-NEXT:    v_mul_lo_u32 v2, s14, v1
6577; GCN-NEXT:    v_mul_hi_u32 v3, s14, v0
6578; GCN-NEXT:    v_mul_lo_u32 v5, s15, v0
6579; GCN-NEXT:    v_mov_b32_e32 v7, s15
6580; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6581; GCN-NEXT:    v_mul_lo_u32 v3, s14, v0
6582; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
6583; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s9, v2
6584; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
6585; GCN-NEXT:    v_subb_u32_e64 v5, s[0:1], v5, v7, vcc
6586; GCN-NEXT:    v_subrev_i32_e64 v7, s[0:1], s14, v3
6587; GCN-NEXT:    v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1]
6588; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v5
6589; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
6590; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
6591; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
6592; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v5
6593; GCN-NEXT:    v_cndmask_b32_e64 v5, v8, v7, s[0:1]
6594; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
6595; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
6596; GCN-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v0
6597; GCN-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
6598; GCN-NEXT:    s_ashr_i32 s8, s13, 31
6599; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
6600; GCN-NEXT:    s_add_u32 s12, s12, s8
6601; GCN-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s[0:1]
6602; GCN-NEXT:    v_mov_b32_e32 v8, s9
6603; GCN-NEXT:    s_mov_b32 s9, s8
6604; GCN-NEXT:    s_addc_u32 s13, s13, s8
6605; GCN-NEXT:    s_xor_b64 s[12:13], s[12:13], s[8:9]
6606; GCN-NEXT:    v_cvt_f32_u32_e32 v10, s12
6607; GCN-NEXT:    v_cvt_f32_u32_e32 v11, s13
6608; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v8, v2, vcc
6609; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
6610; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6611; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
6612; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
6613; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v2
6614; GCN-NEXT:    v_mac_f32_e32 v10, s18, v11
6615; GCN-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
6616; GCN-NEXT:    v_rcp_f32_e32 v3, v10
6617; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
6618; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
6619; GCN-NEXT:    s_sub_u32 s14, 0, s12
6620; GCN-NEXT:    v_mul_f32_e32 v3, s19, v3
6621; GCN-NEXT:    v_mul_f32_e32 v5, s20, v3
6622; GCN-NEXT:    v_trunc_f32_e32 v5, v5
6623; GCN-NEXT:    v_mac_f32_e32 v3, s21, v5
6624; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
6625; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
6626; GCN-NEXT:    v_cndmask_b32_e64 v2, v9, v7, s[0:1]
6627; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6628; GCN-NEXT:    v_mul_hi_u32 v2, s14, v3
6629; GCN-NEXT:    v_mul_lo_u32 v7, s14, v5
6630; GCN-NEXT:    s_subb_u32 s15, 0, s13
6631; GCN-NEXT:    v_mul_lo_u32 v8, s15, v3
6632; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
6633; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
6634; GCN-NEXT:    v_mul_lo_u32 v7, s14, v3
6635; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
6636; GCN-NEXT:    v_mul_lo_u32 v8, v3, v2
6637; GCN-NEXT:    v_mul_hi_u32 v10, v3, v2
6638; GCN-NEXT:    v_mul_hi_u32 v9, v3, v7
6639; GCN-NEXT:    v_mul_hi_u32 v11, v5, v2
6640; GCN-NEXT:    v_mul_lo_u32 v2, v5, v2
6641; GCN-NEXT:    v_xor_b32_e32 v1, s3, v1
6642; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
6643; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
6644; GCN-NEXT:    v_mul_lo_u32 v10, v5, v7
6645; GCN-NEXT:    v_mul_hi_u32 v7, v5, v7
6646; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
6647; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
6648; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
6649; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6650; GCN-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
6651; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
6652; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
6653; GCN-NEXT:    v_mul_lo_u32 v8, s14, v3
6654; GCN-NEXT:    v_mul_hi_u32 v9, s14, v2
6655; GCN-NEXT:    v_mul_lo_u32 v10, s15, v2
6656; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
6657; GCN-NEXT:    v_mul_lo_u32 v9, s14, v2
6658; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
6659; GCN-NEXT:    v_mul_lo_u32 v12, v2, v8
6660; GCN-NEXT:    v_mul_hi_u32 v14, v2, v8
6661; GCN-NEXT:    v_mul_hi_u32 v13, v2, v9
6662; GCN-NEXT:    v_mul_hi_u32 v11, v3, v9
6663; GCN-NEXT:    v_mul_lo_u32 v9, v3, v9
6664; GCN-NEXT:    v_mul_hi_u32 v10, v3, v8
6665; GCN-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
6666; GCN-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
6667; GCN-NEXT:    v_mul_lo_u32 v3, v3, v8
6668; GCN-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
6669; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
6670; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v10, v4, vcc
6671; GCN-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
6672; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
6673; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
6674; GCN-NEXT:    s_ashr_i32 s14, s11, 31
6675; GCN-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
6676; GCN-NEXT:    s_add_u32 s0, s10, s14
6677; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
6678; GCN-NEXT:    s_mov_b32 s15, s14
6679; GCN-NEXT:    s_addc_u32 s1, s11, s14
6680; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
6681; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6682; GCN-NEXT:    v_mul_lo_u32 v5, s10, v3
6683; GCN-NEXT:    v_mul_hi_u32 v7, s10, v2
6684; GCN-NEXT:    v_mul_hi_u32 v9, s10, v3
6685; GCN-NEXT:    v_mul_hi_u32 v10, s11, v3
6686; GCN-NEXT:    v_mul_lo_u32 v3, s11, v3
6687; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6688; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
6689; GCN-NEXT:    v_mul_lo_u32 v9, s11, v2
6690; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
6691; GCN-NEXT:    v_mov_b32_e32 v8, s3
6692; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
6693; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
6694; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
6695; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
6696; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
6697; GCN-NEXT:    v_mul_lo_u32 v4, s12, v3
6698; GCN-NEXT:    v_mul_hi_u32 v5, s12, v2
6699; GCN-NEXT:    v_mul_lo_u32 v6, s13, v2
6700; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
6701; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
6702; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
6703; GCN-NEXT:    v_mul_lo_u32 v5, s12, v2
6704; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
6705; GCN-NEXT:    v_sub_i32_e32 v6, vcc, s11, v4
6706; GCN-NEXT:    v_mov_b32_e32 v7, s13
6707; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s10, v5
6708; GCN-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
6709; GCN-NEXT:    v_subrev_i32_e64 v7, s[0:1], s12, v5
6710; GCN-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
6711; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v6
6712; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
6713; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
6714; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
6715; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v6
6716; GCN-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
6717; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v2
6718; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
6719; GCN-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v2
6720; GCN-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
6721; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
6722; GCN-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
6723; GCN-NEXT:    v_mov_b32_e32 v8, s11
6724; GCN-NEXT:    v_subb_u32_e32 v4, vcc, v8, v4, vcc
6725; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v4
6726; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6727; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v5
6728; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
6729; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v4
6730; GCN-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
6731; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
6732; GCN-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
6733; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
6734; GCN-NEXT:    s_xor_b64 s[0:1], s[14:15], s[8:9]
6735; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
6736; GCN-NEXT:    v_xor_b32_e32 v2, s0, v2
6737; GCN-NEXT:    v_xor_b32_e32 v3, s1, v3
6738; GCN-NEXT:    v_mov_b32_e32 v4, s1
6739; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v2
6740; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
6741; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
6742; GCN-NEXT:    s_endpgm
6743  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
6744  %r = sdiv <2 x i64> %x, %shl.y
6745  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
6746  ret void
6747}
6748
; Test: 64-bit signed remainder by the odd constant 1235195 (0x12d8fb).
; The opt-run assertions expect the srem instruction to appear unchanged in the
; output IR. The llc-run assertions pin the full expanded instruction sequence:
; it starts from a v_rcp_f32 reciprocal estimate, uses 0xffed2705 (the low 32
; bits of -1235195) and 0x12d8fb as scalar constants, and finishes with the
; xor/subtract against the sign mask in s2 (s_ashr_i32 of the high dword of %x)
; before the 64-bit store to %out.
6749define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
6750; CHECK-LABEL: @srem_i64_oddk_denom(
6751; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 1235195
6752; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
6753; CHECK-NEXT:    ret void
6754;
6755; GCN-LABEL: srem_i64_oddk_denom:
6756; GCN:       ; %bb.0:
6757; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
6758; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
6759; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6760; GCN-NEXT:    s_mov_b32 s2, 0xffed2705
6761; GCN-NEXT:    v_mov_b32_e32 v8, 0
6762; GCN-NEXT:    v_mov_b32_e32 v7, 0
6763; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
6764; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
6765; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6766; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
6767; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6768; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6769; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
6770; GCN-NEXT:    s_mov_b32 s7, 0xf000
6771; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
6772; GCN-NEXT:    v_mul_lo_u32 v2, v1, s2
6773; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
6774; GCN-NEXT:    s_mov_b32 s6, -1
6775; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6776; GCN-NEXT:    s_mov_b32 s4, s8
6777; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6778; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
6779; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
6780; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
6781; GCN-NEXT:    v_mul_hi_u32 v3, v0, v2
6782; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
6783; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6784; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
6785; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
6786; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
6787; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
6788; GCN-NEXT:    s_mov_b32 s5, s9
6789; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
6790; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
6791; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
6792; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6793; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
6794; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
6795; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
6796; GCN-NEXT:    v_mul_lo_u32 v4, v2, s2
6797; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
6798; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
6799; GCN-NEXT:    v_mul_lo_u32 v5, v0, s2
6800; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
6801; GCN-NEXT:    v_mul_lo_u32 v10, v0, v4
6802; GCN-NEXT:    v_mul_hi_u32 v12, v0, v4
6803; GCN-NEXT:    v_mul_hi_u32 v11, v0, v5
6804; GCN-NEXT:    v_mul_hi_u32 v9, v2, v5
6805; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
6806; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
6807; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6808; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
6809; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
6810; GCN-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
6811; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
6812; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
6813; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6814; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
6815; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6816; GCN-NEXT:    s_ashr_i32 s2, s11, 31
6817; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
6818; GCN-NEXT:    s_add_u32 s0, s10, s2
6819; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6820; GCN-NEXT:    s_mov_b32 s3, s2
6821; GCN-NEXT:    s_addc_u32 s1, s11, s2
6822; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
6823; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6824; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
6825; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
6826; GCN-NEXT:    v_mul_hi_u32 v4, s0, v1
6827; GCN-NEXT:    v_mul_hi_u32 v5, s1, v1
6828; GCN-NEXT:    v_mul_lo_u32 v1, s1, v1
6829; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6830; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
6831; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
6832; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
6833; GCN-NEXT:    s_mov_b32 s3, 0x12d8fb
6834; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
6835; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6836; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
6837; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6838; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
6839; GCN-NEXT:    v_mul_hi_u32 v2, s3, v0
6840; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
6841; GCN-NEXT:    v_mul_lo_u32 v0, v0, s3
6842; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
6843; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
6844; GCN-NEXT:    v_mov_b32_e32 v2, s1
6845; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
6846; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v0
6847; GCN-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
6848; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s3, v2
6849; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
6850; GCN-NEXT:    s_mov_b32 s0, 0x12d8fa
6851; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
6852; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
6853; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6854; GCN-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
6855; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
6856; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
6857; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
6858; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
6859; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
6860; GCN-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[0:1]
6861; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
6862; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
6863; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
6864; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
6865; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
6866; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
6867; GCN-NEXT:    v_mov_b32_e32 v2, s2
6868; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
6869; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
6870; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6871; GCN-NEXT:    s_endpgm
  ; Input IR under test: a single srem by a non-power-of-two constant.
6872  %r = srem i64 %x, 1235195
6873  store i64 %r, i64 addrspace(1)* %out
6874  ret void
6875}
6876
; Test: 64-bit signed remainder by the power-of-two constant 4096.
; The opt-run assertions expect the srem instruction to appear unchanged in the
; output IR. The llc-run assertions pin a short scalar-only sequence with no
; reciprocal or multiply instructions: the sign of the high dword of %x is
; spread with s_ashr_i32 ..., 31, shifted right by 20 to form a bias, added
; to %x, masked with 0xfffff000, and the result is subtracted from %x.
6877define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
6878; CHECK-LABEL: @srem_i64_pow2k_denom(
6879; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 4096
6880; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
6881; CHECK-NEXT:    ret void
6882;
6883; GCN-LABEL: srem_i64_pow2k_denom:
6884; GCN:       ; %bb.0:
6885; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
6886; GCN-NEXT:    s_mov_b32 s3, 0xf000
6887; GCN-NEXT:    s_mov_b32 s2, -1
6888; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6889; GCN-NEXT:    s_mov_b32 s0, s4
6890; GCN-NEXT:    s_ashr_i32 s4, s7, 31
6891; GCN-NEXT:    s_lshr_b32 s4, s4, 20
6892; GCN-NEXT:    s_add_u32 s4, s6, s4
6893; GCN-NEXT:    s_mov_b32 s1, s5
6894; GCN-NEXT:    s_addc_u32 s5, s7, 0
6895; GCN-NEXT:    s_and_b32 s4, s4, 0xfffff000
6896; GCN-NEXT:    s_sub_u32 s4, s6, s4
6897; GCN-NEXT:    s_subb_u32 s5, s7, s5
6898; GCN-NEXT:    v_mov_b32_e32 v0, s4
6899; GCN-NEXT:    v_mov_b32_e32 v1, s5
6900; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6901; GCN-NEXT:    s_endpgm
  ; Input IR under test: a single srem by the constant 4096 (0x1000).
6902  %r = srem i64 %x, 4096
6903  store i64 %r, i64 addrspace(1)* %out
6904  ret void
6905}
6906
; Test: 64-bit signed remainder where the denominator is 4096 shifted left by
; the runtime value %y, so it is not a compile-time constant.
; The opt-run assertions expect both the shl and the srem to appear unchanged
; in the output IR. The llc-run assertions pin the full expanded sequence:
; s_lshl_b64 builds the denominator, both operands are made non-negative via
; the s_ashr_i32 / add / s_xor_b64 pattern, a v_rcp_f32 estimate is refined
; with 32-bit multiplies, and the result gets the sign of %x (mask in s14)
; applied by xor/subtract before the 64-bit store to %out.
6907define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
6908; CHECK-LABEL: @srem_i64_pow2_shl_denom(
6909; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
6910; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]]
6911; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
6912; CHECK-NEXT:    ret void
6913;
6914; GCN-LABEL: srem_i64_pow2_shl_denom:
6915; GCN:       ; %bb.0:
6916; GCN-NEXT:    s_load_dword s4, s[0:1], 0xd
6917; GCN-NEXT:    s_mov_b32 s3, 0
6918; GCN-NEXT:    s_movk_i32 s2, 0x1000
6919; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
6920; GCN-NEXT:    s_mov_b32 s7, 0xf000
6921; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6922; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
6923; GCN-NEXT:    s_ashr_i32 s4, s3, 31
6924; GCN-NEXT:    s_add_u32 s2, s2, s4
6925; GCN-NEXT:    s_mov_b32 s5, s4
6926; GCN-NEXT:    s_addc_u32 s3, s3, s4
6927; GCN-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
6928; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
6929; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
6930; GCN-NEXT:    s_sub_u32 s2, 0, s12
6931; GCN-NEXT:    s_subb_u32 s3, 0, s13
6932; GCN-NEXT:    s_ashr_i32 s14, s11, 31
6933; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
6934; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6935; GCN-NEXT:    s_mov_b32 s15, s14
6936; GCN-NEXT:    s_mov_b32 s6, -1
6937; GCN-NEXT:    s_mov_b32 s4, s8
6938; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
6939; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
6940; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6941; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
6942; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6943; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6944; GCN-NEXT:    s_mov_b32 s5, s9
6945; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
6946; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
6947; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
6948; GCN-NEXT:    v_mul_lo_u32 v4, s2, v0
6949; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6950; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
6951; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
6952; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
6953; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
6954; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
6955; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6956; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
6957; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
6958; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
6959; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
6960; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
6961; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
6962; GCN-NEXT:    v_mov_b32_e32 v4, 0
6963; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
6964; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6965; GCN-NEXT:    v_mov_b32_e32 v6, 0
6966; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
6967; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
6968; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
6969; GCN-NEXT:    v_mul_lo_u32 v5, s2, v2
6970; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
6971; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
6972; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6973; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
6974; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
6975; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
6976; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
6977; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
6978; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
6979; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
6980; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
6981; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6982; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
6983; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
6984; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
6985; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
6986; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
6987; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6988; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
6989; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6990; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
6991; GCN-NEXT:    s_add_u32 s0, s10, s14
6992; GCN-NEXT:    s_addc_u32 s1, s11, s14
6993; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6994; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
6995; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6996; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
6997; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
6998; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
6999; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
7000; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
7001; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7002; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
7003; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
7004; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
7005; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
7006; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
7007; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
7008; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7009; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
7010; GCN-NEXT:    v_mul_lo_u32 v1, s12, v1
7011; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
7012; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
7013; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
7014; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
7015; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
7016; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
7017; GCN-NEXT:    v_mov_b32_e32 v3, s13
7018; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
7019; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
7020; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
7021; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
7022; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
7023; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
7024; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
7025; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
7026; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
7027; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
7028; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
7029; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
7030; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
7031; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
7032; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
7033; GCN-NEXT:    v_mov_b32_e32 v5, s11
7034; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
7035; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
7036; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
7037; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
7038; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
7039; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
7040; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
7041; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
7042; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
7043; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
7044; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7045; GCN-NEXT:    v_xor_b32_e32 v0, s14, v0
7046; GCN-NEXT:    v_xor_b32_e32 v1, s14, v1
7047; GCN-NEXT:    v_mov_b32_e32 v2, s14
7048; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v0
7049; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
7050; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7051; GCN-NEXT:    s_endpgm
  ; Input IR under test: the denominator is computed at run time as 4096 << %y.
7052  %shl.y = shl i64 4096, %y
7053  %r = srem i64 %x, %shl.y
7054  store i64 %r, i64 addrspace(1)* %out
7055  ret void
7056}
7057
; Test: <2 x i64> signed remainder by the splat power-of-two constant 4096.
; The opt-run assertions expect the vector srem to be split into one scalar
; srem per element (extractelement / srem / insertelement, starting from an
; undef vector). The llc-run assertions pin a scalar-only sequence that is
; repeated for each element: s_ashr_i32 ..., 31, s_lshr_b32 ..., 20, add, and
; with the mask held in s8, then subtract. The four result dwords are stored
; with a single buffer_store_dwordx4.
7058define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
7059; CHECK-LABEL: @srem_v2i64_pow2k_denom(
7060; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7061; CHECK-NEXT:    [[TMP2:%.*]] = srem i64 [[TMP1]], 4096
7062; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
7063; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
7064; CHECK-NEXT:    [[TMP5:%.*]] = srem i64 [[TMP4]], 4096
7065; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
7066; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]]
7067; CHECK-NEXT:    ret void
7068;
7069; GCN-LABEL: srem_v2i64_pow2k_denom:
7070; GCN:       ; %bb.0:
7071; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7072; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
7073; GCN-NEXT:    s_movk_i32 s8, 0xf000
7074; GCN-NEXT:    s_mov_b32 s7, 0xf000
7075; GCN-NEXT:    s_mov_b32 s6, -1
7076; GCN-NEXT:    s_waitcnt lgkmcnt(0)
7077; GCN-NEXT:    s_ashr_i32 s9, s1, 31
7078; GCN-NEXT:    s_lshr_b32 s9, s9, 20
7079; GCN-NEXT:    s_add_u32 s9, s0, s9
7080; GCN-NEXT:    s_addc_u32 s10, s1, 0
7081; GCN-NEXT:    s_and_b32 s9, s9, s8
7082; GCN-NEXT:    s_sub_u32 s0, s0, s9
7083; GCN-NEXT:    s_subb_u32 s1, s1, s10
7084; GCN-NEXT:    s_ashr_i32 s9, s3, 31
7085; GCN-NEXT:    s_lshr_b32 s9, s9, 20
7086; GCN-NEXT:    s_add_u32 s9, s2, s9
7087; GCN-NEXT:    s_addc_u32 s10, s3, 0
7088; GCN-NEXT:    s_and_b32 s8, s9, s8
7089; GCN-NEXT:    s_sub_u32 s2, s2, s8
7090; GCN-NEXT:    s_subb_u32 s3, s3, s10
7091; GCN-NEXT:    v_mov_b32_e32 v0, s0
7092; GCN-NEXT:    v_mov_b32_e32 v1, s1
7093; GCN-NEXT:    v_mov_b32_e32 v2, s2
7094; GCN-NEXT:    v_mov_b32_e32 v3, s3
7095; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
7096; GCN-NEXT:    s_endpgm
  ; Input IR under test: both lanes are divided by the same constant 4096.
7097  %r = srem <2 x i64> %x, <i64 4096, i64 4096>
7098  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
7099  ret void
7100}
7101
7102define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
7103; CHECK-LABEL: @srem_v2i64_pow2_shl_denom(
7104; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
7105; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7106; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
7107; CHECK-NEXT:    [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]]
7108; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
7109; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
7110; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
7111; CHECK-NEXT:    [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]]
7112; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
7113; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]]
7114; CHECK-NEXT:    ret void
7115;
7116; GCN-LABEL: srem_v2i64_pow2_shl_denom:
7117; GCN:       ; %bb.0:
7118; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
7119; GCN-NEXT:    s_mov_b32 s3, 0
7120; GCN-NEXT:    s_movk_i32 s2, 0x1000
7121; GCN-NEXT:    s_mov_b32 s18, 0x4f800000
7122; GCN-NEXT:    s_mov_b32 s19, 0x5f7ffffc
7123; GCN-NEXT:    s_waitcnt lgkmcnt(0)
7124; GCN-NEXT:    s_lshl_b64 s[14:15], s[2:3], s6
7125; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
7126; GCN-NEXT:    s_ashr_i32 s4, s3, 31
7127; GCN-NEXT:    s_add_u32 s2, s2, s4
7128; GCN-NEXT:    s_mov_b32 s5, s4
7129; GCN-NEXT:    s_addc_u32 s3, s3, s4
7130; GCN-NEXT:    s_xor_b64 s[16:17], s[2:3], s[4:5]
7131; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s16
7132; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s17
7133; GCN-NEXT:    s_mov_b32 s20, 0x2f800000
7134; GCN-NEXT:    s_mov_b32 s21, 0xcf800000
7135; GCN-NEXT:    s_sub_u32 s6, 0, s16
7136; GCN-NEXT:    v_mac_f32_e32 v0, s18, v1
7137; GCN-NEXT:    v_rcp_f32_e32 v0, v0
7138; GCN-NEXT:    s_subb_u32 s7, 0, s17
7139; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7140; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
7141; GCN-NEXT:    v_mul_f32_e32 v0, s19, v0
7142; GCN-NEXT:    v_mul_f32_e32 v1, s20, v0
7143; GCN-NEXT:    v_trunc_f32_e32 v1, v1
7144; GCN-NEXT:    v_mac_f32_e32 v0, s21, v1
7145; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
7146; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
7147; GCN-NEXT:    s_waitcnt lgkmcnt(0)
7148; GCN-NEXT:    s_ashr_i32 s12, s9, 31
7149; GCN-NEXT:    s_add_u32 s0, s8, s12
7150; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
7151; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
7152; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
7153; GCN-NEXT:    v_mul_lo_u32 v5, s6, v0
7154; GCN-NEXT:    s_mov_b32 s13, s12
7155; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7156; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7157; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
7158; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
7159; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
7160; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
7161; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
7162; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
7163; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
7164; GCN-NEXT:    v_mul_lo_u32 v6, v1, v5
7165; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
7166; GCN-NEXT:    s_addc_u32 s1, s9, s12
7167; GCN-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
7168; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
7169; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
7170; GCN-NEXT:    v_mov_b32_e32 v4, 0
7171; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
7172; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7173; GCN-NEXT:    v_mov_b32_e32 v6, 0
7174; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
7175; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
7176; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
7177; GCN-NEXT:    v_mul_lo_u32 v5, s6, v2
7178; GCN-NEXT:    v_mul_hi_u32 v7, s6, v0
7179; GCN-NEXT:    v_mul_lo_u32 v8, s7, v0
7180; GCN-NEXT:    s_mov_b32 s7, 0xf000
7181; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
7182; GCN-NEXT:    v_mul_lo_u32 v7, s6, v0
7183; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
7184; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
7185; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
7186; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
7187; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
7188; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
7189; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
7190; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
7191; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
7192; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
7193; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
7194; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
7195; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
7196; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
7197; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
7198; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
7199; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
7200; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7201; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7202; GCN-NEXT:    v_mul_lo_u32 v2, s8, v1
7203; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
7204; GCN-NEXT:    v_mul_hi_u32 v5, s8, v1
7205; GCN-NEXT:    v_mul_hi_u32 v7, s9, v1
7206; GCN-NEXT:    v_mul_lo_u32 v1, s9, v1
7207; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7208; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
7209; GCN-NEXT:    v_mul_lo_u32 v5, s9, v0
7210; GCN-NEXT:    v_mul_hi_u32 v0, s9, v0
7211; GCN-NEXT:    s_mov_b32 s6, -1
7212; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
7213; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
7214; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
7215; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7216; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
7217; GCN-NEXT:    v_mul_lo_u32 v1, s16, v1
7218; GCN-NEXT:    v_mul_hi_u32 v2, s16, v0
7219; GCN-NEXT:    v_mul_lo_u32 v3, s17, v0
7220; GCN-NEXT:    v_mul_lo_u32 v0, s16, v0
7221; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
7222; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
7223; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s9, v1
7224; GCN-NEXT:    v_mov_b32_e32 v3, s17
7225; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
7226; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
7227; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s16, v0
7228; GCN-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1]
7229; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s17, v7
7230; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
7231; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
7232; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v5
7233; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s16, v5
7234; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
7235; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s17, v7
7236; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
7237; GCN-NEXT:    s_ashr_i32 s2, s15, 31
7238; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
7239; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
7240; GCN-NEXT:    s_add_u32 s8, s14, s2
7241; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
7242; GCN-NEXT:    v_mov_b32_e32 v7, s9
7243; GCN-NEXT:    s_mov_b32 s3, s2
7244; GCN-NEXT:    s_addc_u32 s9, s15, s2
7245; GCN-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
7246; GCN-NEXT:    v_cvt_f32_u32_e32 v8, s8
7247; GCN-NEXT:    v_cvt_f32_u32_e32 v9, s9
7248; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v7, v1, vcc
7249; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
7250; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
7251; GCN-NEXT:    v_mac_f32_e32 v8, s18, v9
7252; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
7253; GCN-NEXT:    v_rcp_f32_e32 v8, v8
7254; GCN-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
7255; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v1
7256; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
7257; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
7258; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
7259; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
7260; GCN-NEXT:    v_mul_f32_e32 v3, s19, v8
7261; GCN-NEXT:    v_mul_f32_e32 v5, s20, v3
7262; GCN-NEXT:    v_trunc_f32_e32 v5, v5
7263; GCN-NEXT:    v_mac_f32_e32 v3, s21, v5
7264; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
7265; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
7266; GCN-NEXT:    s_sub_u32 s2, 0, s8
7267; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7268; GCN-NEXT:    v_mul_hi_u32 v2, s2, v3
7269; GCN-NEXT:    v_mul_lo_u32 v7, s2, v5
7270; GCN-NEXT:    s_subb_u32 s3, 0, s9
7271; GCN-NEXT:    v_mul_lo_u32 v8, s3, v3
7272; GCN-NEXT:    s_ashr_i32 s14, s11, 31
7273; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
7274; GCN-NEXT:    v_mul_lo_u32 v7, s2, v3
7275; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
7276; GCN-NEXT:    v_mul_lo_u32 v8, v3, v2
7277; GCN-NEXT:    v_mul_hi_u32 v10, v3, v2
7278; GCN-NEXT:    v_mul_hi_u32 v9, v3, v7
7279; GCN-NEXT:    v_mul_hi_u32 v11, v5, v2
7280; GCN-NEXT:    v_mul_lo_u32 v2, v5, v2
7281; GCN-NEXT:    s_mov_b32 s15, s14
7282; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
7283; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
7284; GCN-NEXT:    v_mul_lo_u32 v10, v5, v7
7285; GCN-NEXT:    v_mul_hi_u32 v7, v5, v7
7286; GCN-NEXT:    v_xor_b32_e32 v0, s12, v0
7287; GCN-NEXT:    v_xor_b32_e32 v1, s12, v1
7288; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
7289; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
7290; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
7291; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
7292; GCN-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
7293; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
7294; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
7295; GCN-NEXT:    v_mul_lo_u32 v8, s2, v3
7296; GCN-NEXT:    v_mul_hi_u32 v9, s2, v2
7297; GCN-NEXT:    v_mul_lo_u32 v10, s3, v2
7298; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
7299; GCN-NEXT:    v_mul_lo_u32 v9, s2, v2
7300; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
7301; GCN-NEXT:    v_mul_lo_u32 v12, v2, v8
7302; GCN-NEXT:    v_mul_hi_u32 v14, v2, v8
7303; GCN-NEXT:    v_mul_hi_u32 v13, v2, v9
7304; GCN-NEXT:    v_mul_hi_u32 v11, v3, v9
7305; GCN-NEXT:    v_mul_lo_u32 v9, v3, v9
7306; GCN-NEXT:    v_mul_hi_u32 v10, v3, v8
7307; GCN-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
7308; GCN-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
7309; GCN-NEXT:    v_mul_lo_u32 v3, v3, v8
7310; GCN-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
7311; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
7312; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v10, v4, vcc
7313; GCN-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
7314; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
7315; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
7316; GCN-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
7317; GCN-NEXT:    s_add_u32 s0, s10, s14
7318; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
7319; GCN-NEXT:    s_addc_u32 s1, s11, s14
7320; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
7321; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
7322; GCN-NEXT:    v_mul_lo_u32 v5, s10, v3
7323; GCN-NEXT:    v_mul_hi_u32 v7, s10, v2
7324; GCN-NEXT:    v_mul_hi_u32 v9, s10, v3
7325; GCN-NEXT:    v_mul_hi_u32 v10, s11, v3
7326; GCN-NEXT:    v_mul_lo_u32 v3, s11, v3
7327; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
7328; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
7329; GCN-NEXT:    v_mul_lo_u32 v9, s11, v2
7330; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
7331; GCN-NEXT:    v_mov_b32_e32 v8, s12
7332; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
7333; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
7334; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
7335; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
7336; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
7337; GCN-NEXT:    v_mul_lo_u32 v3, s8, v3
7338; GCN-NEXT:    v_mul_hi_u32 v4, s8, v2
7339; GCN-NEXT:    v_mul_lo_u32 v5, s9, v2
7340; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
7341; GCN-NEXT:    v_mul_lo_u32 v2, s8, v2
7342; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
7343; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
7344; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
7345; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v3
7346; GCN-NEXT:    v_mov_b32_e32 v5, s9
7347; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
7348; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
7349; GCN-NEXT:    v_subrev_i32_e64 v6, s[0:1], s8, v2
7350; GCN-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
7351; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v7
7352; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
7353; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
7354; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v6
7355; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v6
7356; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
7357; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v7
7358; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
7359; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
7360; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
7361; GCN-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
7362; GCN-NEXT:    v_mov_b32_e32 v7, s11
7363; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
7364; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
7365; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
7366; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
7367; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
7368; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
7369; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
7370; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
7371; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
7372; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
7373; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
7374; GCN-NEXT:    v_xor_b32_e32 v2, s14, v2
7375; GCN-NEXT:    v_xor_b32_e32 v3, s14, v3
7376; GCN-NEXT:    v_mov_b32_e32 v4, s14
7377; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v2
7378; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
7379; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
7380; GCN-NEXT:    s_endpgm
7381  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
7382  %r = srem <2 x i64> %x, %shl.y
7383  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
7384  ret void
7385}
7386